# Netflix Stock Data Processing

In [164]:
from pandas_datareader import data, wb
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import kagglehub
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

### Data Cleaning

In [165]:
# Download the Kaggle dataset (or reuse the cached copy) and keep the local directory it lands in.
KAGGLE_DATASET = "matiflatif/netflix-complete-stock-dataweekly-updated"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\lixin\.cache\kagglehub\datasets\matiflatif\netflix-complete-stock-dataweekly-updated\versions\6


In [166]:
import os

# Reuse the directory returned by kagglehub.dataset_download() instead of a
# hard-coded, user-specific absolute path, so the notebook runs on any machine.
dataset_path = path

# os.listdir() returns entries in arbitrary order; sort for a stable, readable listing.
files = sorted(os.listdir(dataset_path))
print(files)

['NFLX_2002-05-23_2025-01-31.csv', 'NFLX_2002-05-23_2025-02-07.csv', 'NFLX_2002-05-23_2025-02-18.csv', 'NFLX_2002-05-23_2025-02-24.csv', 'NFLX_2002-05-23_2025-03-02 (1).csv']


In [167]:
# Load the most recent snapshot instead of hard-coding one file name: the dataset
# directory holds one "NFLX_<start>_<end>.csv" file per refresh.
# NOTE(review): relies on the ISO-formatted end date in the file name, so that the
# lexicographically greatest name is the newest snapshot - confirm if the naming changes.
csv_files = [name for name in os.listdir(dataset_path) if name.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {dataset_path!r}")

file_path = os.path.join(dataset_path, max(csv_files))
df = pd.read_csv(file_path)

In [168]:
# Preview the first rows of the raw file to check column names and the date format.
df.head(n=5)

Unnamed: 0,date,open,high,low,close,adj_close,volume
0,2002-05-23 00:00:00-04:00,1.156429,1.242857,1.145714,1.196429,1.196429,104790000
1,2002-05-24 00:00:00-04:00,1.214286,1.225,1.197143,1.21,1.21,11104800
2,2002-05-28 00:00:00-04:00,1.213571,1.232143,1.157143,1.157143,1.157143,6609400
3,2002-05-29 00:00:00-04:00,1.164286,1.164286,1.085714,1.103571,1.103571,6757800
4,2002-05-30 00:00:00-04:00,1.107857,1.107857,1.071429,1.071429,1.071429,10154200


In [169]:
# Parse the offset-bearing timestamp strings (e.g. "...-04:00") into timezone-aware
# UTC datetimes; values that cannot be parsed become NaT instead of raising.
df['date'] = pd.to_datetime(
    df['date'],
    utc=True,
    errors='coerce',
)

In [170]:
# Confirm the column is now timezone-aware, then preview the converted rows as plain text.
date_dtype = df['date'].dtype
print(date_dtype)
print(df.head(n=5))

datetime64[ns, UTC]
                       date      open      high       low     close  \
0 2002-05-23 04:00:00+00:00  1.156429  1.242857  1.145714  1.196429   
1 2002-05-24 04:00:00+00:00  1.214286  1.225000  1.197143  1.210000   
2 2002-05-28 04:00:00+00:00  1.213571  1.232143  1.157143  1.157143   
3 2002-05-29 04:00:00+00:00  1.164286  1.164286  1.085714  1.103571   
4 2002-05-30 04:00:00+00:00  1.107857  1.107857  1.071429  1.071429   

   adj_close     volume  
0   1.196429  104790000  
1   1.210000   11104800  
2   1.157143    6609400  
3   1.103571    6757800  
4   1.071429   10154200  


In [171]:
# Keep only the calendar day of each (UTC) timestamp, rendered as 'YYYY-MM-DD' text.
date_strings = df['date'].dt.strftime('%Y-%m-%d')
df['date'] = date_strings

In [172]:
# Parse the column back into datetimes (the recorded df.info() output
# shows the result as timezone-naive datetime64[ns]).
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [173]:
# Summarise column dtypes and non-null counts to confirm the date column was parsed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5730 entries, 0 to 5729
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5730 non-null   datetime64[ns]
 1   open       5730 non-null   float64       
 2   high       5730 non-null   float64       
 3   low        5730 non-null   float64       
 4   close      5730 non-null   float64       
 5   adj_close  5730 non-null   float64       
 6   volume     5730 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 313.5 KB


In [174]:
# Summary statistics (count, mean, quartiles, extremes, std) to sanity-check value ranges.
df.describe()

Unnamed: 0,date,open,high,low,close,adj_close,volume
count,5730,5730.0,5730.0,5730.0,5730.0,5730.0,5730.0
mean,2013-10-08 12:34:40.837696256,161.690117,164.06279,159.262061,161.720848,161.720848,15285120.0
min,2002-05-23 00:00:00,0.377857,0.410714,0.346429,0.372857,0.372857,285600.0
25%,2008-01-31 06:00:00,4.266428,4.357143,4.1875,4.261429,4.261429,5456225.0
50%,2013-10-08 12:00:00,45.32,46.216429,44.207144,45.046429,45.046429,9529150.0
75%,2019-06-18 18:00:00,309.107491,314.087494,303.524986,309.62751,309.62751,18198820.0
max,2025-02-28 00:00:00,1060.0,1064.5,1041.689941,1058.599976,1058.599976,323414000.0
std,,213.930023,216.729422,211.021914,213.921159,213.921159,18448380.0


In [175]:
# Count missing values per column.
df.isna().sum()

date         0
open         0
high         0
low          0
close        0
adj_close    0
volume       0
dtype: int64

In [176]:
# Drop rows with any missing value (e.g. dates coerced to NaT during parsing).
# Reassigning instead of using inplace=True keeps the step explicit and chain-friendly.
df = df.dropna()

In [None]:
# Persist the cleaned frame next to the notebook, without the positional index column.
cleaned_csv_name = "cleaned_Netflix_stocks.csv"
df.to_csv(cleaned_csv_name, index=False)

### Exploratory Data Analysis (EDA)

In [177]:
# Preview the cleaned frame that the exploratory analysis starts from.
df.head(n=5)

Unnamed: 0,date,open,high,low,close,adj_close,volume
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200


In [178]:
# Row-over-row fractional change of the closing price (the first row has no predecessor, so it is NaN).
closing_prices = df['close']
df['daily_return'] = closing_prices.pct_change()

In [179]:
# Volatility = standard deviation of the daily returns over a rolling 30-row window.
returns_window = df['daily_return'].rolling(window=30)
df['volatility'] = returns_window.std()

In [180]:
# Moving average of the closing price over a rolling 30-row window.
close_window = df['close'].rolling(window=30)
df['MA30'] = close_window.mean()

# Plot the raw close and its moving average on a shared price axis.
axis_labels = {'value': 'Stock Price (USD)', 'date': 'Date'}
fig_ma30 = px.line(
    df,
    x='date',
    y=['close', 'MA30'],
    labels=axis_labels,
    title="Netflix Stock Price with 30-day Moving Average",
)

fig_ma30.show()

In [181]:
# 'daily_return' was already derived from the closing price earlier in this section,
# so it is reused here rather than recomputed.
# Histogram of daily returns with a box plot in the margin; the label mapping gives
# the x axis a readable name instead of the raw column name.
fig_hist = px.histogram(
    df,
    x='daily_return',
    marginal='box',
    nbins=100,
    opacity=0.75,
    labels={'daily_return': 'Daily Return'},
    title="Distribution of Netflix Stock Daily Returns",
)

fig_hist.show()

In [182]:
# Closing price on the primary (left) y axis.
price_trace = go.Scatter(
    x=df['date'],
    y=df['close'],
    name="Stock Price",
    line={'color': 'blue'},
)

# Rolling volatility as a filled area bound to the secondary (right) y axis.
volatility_trace = go.Scatter(
    x=df['date'],
    y=df['volatility'],
    name="Volatility",
    yaxis="y2",
    fill='tozeroy',
    line={'color': 'red', 'width': 1},
)

fig = go.Figure(data=[price_trace, volatility_trace])

fig.update_layout(
    title="Netflix Stock Price and Volatility Over Time",
    autosize=True,
    margin={'b': 50},
    xaxis_title="Date",
    yaxis={'title': "Stock Price (USD)"},
    # The secondary axis overlays the price axis and is pinned to a fixed range.
    yaxis2={'title': "Volatility", 'overlaying': "y", 'side': "right", 'range': [0, 0.15]},
    # Horizontal legend centred underneath the plot area.
    legend={'x': 0.5, 'y': -0.3, 'xanchor': "center", 'yanchor': "top", 'orientation': "h"},
)

fig.show()

In [183]:
# Derive the window from the data so the "Last 3 Years" title stays accurate when the
# dataset is refreshed, instead of relying on a hard-coded start date.
latest_date = df['date'].max()
cutoff_date = latest_date - pd.DateOffset(years=3)
df_recent = df[df['date'] >= cutoff_date]

# One candle per row: open/high/low/close for each trading day in the window.
fig_candlestick = go.Figure(
    data=[
        go.Candlestick(
            x=df_recent['date'],
            open=df_recent['open'],
            high=df_recent['high'],
            low=df_recent['low'],
            close=df_recent['close'],
        )
    ]
)

fig_candlestick.update_layout(
    title="Netflix Candlestick Chart (Last 3 Years)",
    # Range slider under the x axis allows zooming into a sub-period.
    xaxis=dict(title="Date", rangeslider=dict(visible=True)),
    yaxis=dict(title="Stock Price (USD)"),
)

fig_candlestick.show()