# 🎬 Movie Data Analysis Project
This notebook performs data cleaning, transformation, and visualization on a movie dataset.

## 📦 Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 📂 Load Dataset

In [None]:
# Read the raw movie CSV, treating only '\n' as the row terminator.
df = pd.read_csv(
    'mymoviedb.csv',
    lineterminator='\n',
)
# Show the first rows as a quick sanity check of the load.
df.head()

## ℹ️ Dataset Information

In [None]:
# Print the DataFrame summary (column names, non-null counts, dtypes).
df.info()

## 🎭 Explore Genre Column

In [None]:
# Peek at the first few raw values of the Genre column.
df['Genre'].head()

## 🔍 Check for Duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## 📊 Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## 🧹 Data Cleaning
Convert `Release_Date` to year and drop unnecessary columns.

In [None]:
# Parse the release dates and keep only the year component.
df['Release_Date'] = pd.to_datetime(df['Release_Date']).dt.year

# Remove the columns that the rest of the notebook does not use.
cols = ['Overview', 'Original_Language', 'Poster_Url']
df.drop(columns=cols, inplace=True)

df.head()

## 🏷️ Categorize `Vote_Average` into Labels

In [None]:
def catigorize_col(df, col, labels):
    """Replace a numeric column with quartile-based category labels, in place.

    The bin edges are the column's minimum, 25th, 50th and 75th percentiles
    and maximum, so four labels are expected when all edges are distinct.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Name of the numeric column to categorize.
        labels: Labels for the bins, ordered from lowest to highest bin.

    Returns:
        The same ``df`` object, with ``df[col]`` converted to a categorical.

    Raises:
        ValueError: Propagated from ``pd.cut`` when the number of labels does
            not match the number of bins (e.g. after duplicate edges are
            dropped because several quartiles coincide).
    """
    # A single describe() call supplies all five edges.
    summary = df[col].describe()
    edges = [summary[key] for key in ('min', '25%', '50%', '75%', 'max')]

    # pd.cut bins are right-closed by default, which leaves the left edge of
    # the first bin open. include_lowest=True closes it so rows equal to the
    # column minimum get the first label instead of silently becoming NaN.
    df[col] = pd.cut(
        df[col],
        edges,
        labels=labels,
        duplicates='drop',
        include_lowest=True,
    )
    return df

# Quartile labels, ordered from the lowest-rated bin to the highest.
labels = [
    'not_popular',
    'below_avg',
    'average',
    'popular',
]
df = catigorize_col(df, col='Vote_Average', labels=labels)

# Number of rows that landed in each category.
df['Vote_Average'].value_counts()

## 🚫 Drop Missing Values

In [None]:
# Discard every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

# Confirm nothing is missing any more: per-column count of nulls.
df.isnull().sum()

## 💥 Explode Genre Column

In [None]:
# Turn the comma-separated genre string into a list of genres per row.
df['Genre'] = df['Genre'].str.split(pat=', ')

# One row per (movie, genre) pair, then renumber the index from zero.
df = df.explode(column='Genre')
df = df.reset_index(drop=True)

# Store the genres as a categorical column.
df['Genre'] = df['Genre'].astype('category')

df.head()

## 📋 Dataset Summary After Cleaning

In [None]:
# Print the DataFrame summary (column names, non-null counts, dtypes) after cleaning.
df.info()
# Number of distinct values in each column.
df.nunique()

## 📈 Data Visualizations

### 🎭 Genre Distribution

In [None]:
sns.set_style('whitegrid')

# Order the bars from the most frequent genre to the least frequent.
genre_order = df['Genre'].value_counts().index
sns.catplot(
    data=df,
    y='Genre',
    kind='count',
    order=genre_order,
    color='#4287f5',
)
plt.title('Genre Column Distribution')
plt.show()

### ⭐ Vote Average Category Distribution

In [None]:
# Order the bars from the most frequent vote category to the least frequent.
vote_order = df['Vote_Average'].value_counts().index
sns.catplot(
    data=df,
    y='Vote_Average',
    kind='count',
    order=vote_order,
    color='#4287f5',
)
plt.title('Votes Distribution')
plt.show()

### 🔝 Most Popular Movie

In [None]:
# Select every row whose popularity equals the highest value in the column.
highest_popularity = df['Popularity'].max()
df[df['Popularity'] == highest_popularity]

### 🔻 Least Popular Movie

In [None]:
# Select every row whose popularity equals the lowest value in the column.
lowest_popularity = df['Popularity'].min()
df[df['Popularity'] == lowest_popularity]

### 📅 Release Year Distribution

In [None]:
# After the genre explode each movie occupies one row per genre, so plotting
# df directly would count multi-genre movies several times. Collapse rows that
# are identical in every column except Genre back to a single row per movie so
# the y-axis really is a number of movies.
non_genre_cols = df.columns.difference(['Genre']).tolist()
movies = df.drop_duplicates(subset=non_genre_cols)

movies['Release_Date'].hist()
plt.title('Release Year Distribution')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.show()