In [10]:
import pandas as pd

# STEP 1: Load the datasets
### We read the financial dataset (box office + budget) and IMDb dataset (ratings + votes + language)
### These files are collected from real web sources and stored locally.

In [11]:
financial["Title_clean"] = financial["Title"].str.lower().str.strip()
imdb["Title_clean"] = imdb["Title"].str.lower().str.strip()


# STEP 3: Convert Release Date to datetime format
### We use `format="mixed"` to allow different date formats in the same column.
### `errors="coerce"` will convert invalid or unknown dates to NaT instead of crashing.




In [12]:
financial["Release Date"] = pd.to_datetime(
    financial["Release Date"],
    errors="coerce",
    format="mixed"
)

# STEP 4: Extract the year from Release Date
### This will be used as part of the merge key, because many movies share the same title
### but are different films from different years.

In [13]:
financial["Year"] = financial["Release Date"].dt.year

# STEP 5: Merge financial + IMDb using the cleaned title and year
### We use `how="inner"` meaning:
###             Only movies that exist in BOTH datasets will be kept.
### This avoids merging unrelated movies.

In [14]:
merged = pd.merge(
    financial, imdb, how="inner", on=["Title_clean", "Year"]
)


# STEP 6: Fix duplicate Title columns created by merge
### Pandas adds _x and _y when two files have the same column name.
### We:
###  - Drop "Title_y" (IMDb original Title)
###  - Rename "Title_x" back to "Title" (financial original Title)


In [15]:
merged = merged.drop(columns=["Title_y"], errors="ignore")
merged = merged.rename(columns={"Title_x": "Title"})

# STEP 7: Remove the helper column "Title_clean"
### We no longer need it after merging.

In [None]:
merged = merged.drop(columns=["Title_clean"], errors="ignore")


# STEP 8: add season cloumn

In [28]:
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    elif month in [9, 10, 11]:
        return "Fall"
    else:
        return None

merged["Season"] = merged["Release Date"].dt.month.apply(get_season)


# STEP 8: Save the merged dataset for later use
### This file now contains:
###  Release Date, Title, Budget, Domestic Gross, Worldwide Gross, Rating, Vote Count, Language, Year


In [29]:
merged.to_csv(r"E:\semester 5\Data Analysis\movie_data_analysis\data\movies_dataset_final.csv", index=False)

# STEP 9: Reload the cleaned merged dataset to continue integration

In [30]:
df_mine = pd.read_csv(r"E:\semester 5\Data Analysis\movie_data_analysis\data\movies_dataset_final.csv")

# STEP 10: Load Rotten Tomatoes dataset
### This dataset contains critic and audience scores, which we want to integrate too.

In [31]:
df_rt = pd.read_csv(r"E:\semester 5\Data Analysis\movie_data_analysis\data\rotten_tomatoes_movies.csv")

# STEP 11: Keep only the useful columns from Rotten Tomatoes
### We ignore unnecessary columns to keep the dataset clean and analysis-ready.

In [32]:
df_rt_clean = df_rt[['movie_title', 'original_release_date', 'tomatometer_rating', 'audience_rating']].copy()


# STEP 12: Normalize titles again for RT dataset
### This ensures RT titles follow the same format as the other datasets.

In [33]:
df_rt_clean["Title_clean"] = df_rt_clean["movie_title"].str.lower().str.strip()

# STEP 13: Extract the release year from Rotten Tomatoes release date
### This will allow us to match movies across all 3 sources.

In [34]:
df_rt_clean['Year'] = pd.to_datetime(df_rt_clean['original_release_date'], errors="coerce").dt.year


# STEP 14: Rename RT columns to clearer names
### This makes them easier to use in analysis and avoids confusion.

In [35]:
df_rt_clean = df_rt_clean.rename(columns={
    'movie_title': 'Title',
    'tomatometer_rating': 'Critic_Score',
    'audience_rating': 'Audience_Score'
})

# STEP 15: Remove the original helper column we used for cleaning

In [36]:
df_rt_clean = df_rt_clean.drop(columns=["movie_title"], errors="ignore")


# STEP 16: Final merge â†’ integrate financial+IMDb dataset with Rotten Tomatoes
### Again using `inner` to keep only movies found in ALL datasets.
### This final dataset is now fully integrated and ready for prediction analysis.

In [37]:
final = pd.merge(df_mine, df_rt_clean, on=['Title', 'Year'], how='inner')

# STEP 17: Save the fully integrated dataset
### This is the dataset you will use in your project for EDA and modeling.

In [38]:
final.to_csv(r"E:\semester 5\Data Analysis\movie_data_analysis\data\movies_project_final_dataset.csv", index=False)


# STEP 18: Display final dataset size and structure
### This confirms merge success and shows final columns + data types.

In [39]:

print("Final dataset shape:", final.shape)
final.info()

Final dataset shape: (1998, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Release Date           1998 non-null   object 
 1   Title                  1998 non-null   object 
 2   Budget                 1998 non-null   object 
 3   Domestic Gross         1998 non-null   object 
 4   Worldwide Gross        1998 non-null   object 
 5   Year                   1998 non-null   float64
 6   Rating                 1998 non-null   float64
 7   Vote Count             1998 non-null   int64  
 8   Language               1998 non-null   object 
 9   Season                 1998 non-null   object 
 10  original_release_date  1998 non-null   object 
 11  Critic_Score           1995 non-null   float64
 12  Audience_Score         1995 non-null   float64
 13  Title_clean            1998 non-null   object 
dtypes: float64(4), int64(1),