In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
# Ensure plots show in the notebook
%matplotlib inline

In [46]:
import os

# Create directory for plots if it doesn't exist
os.makedirs('distribution_plots', exist_ok=True)

In [47]:
# Read the CSV file
df = pd.read_csv('NFLX.csv')

1. DATA OVERVIEW

In [48]:
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
print("\nColumn Data Types:")
print(df.dtypes)

Number of Rows: 1009
Number of Columns: 7

Column Data Types:
Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


2. DATA QUALITY ASSESSMENT

In [49]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [50]:
# Identify duplicate records
duplicates = df.duplicated()
print(f"\nNumber of Duplicate Records: {duplicates.sum()}")
if duplicates.sum() > 0:
    print("Duplicate Rows:")
    print(df[duplicates])


Number of Duplicate Records: 0


3. DATA PREPROCESSING

In [51]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [52]:
# Interpolate missing values
numeric_columns = df.select_dtypes(include=[np.number]).columns
df_processed = df.copy()
df_processed[numeric_columns] = df_processed[numeric_columns].interpolate()

print("Preprocessing Complete:")
print("- Date column converted to datetime")
print("- Missing values interpolated")

Preprocessing Complete:
- Date column converted to datetime
- Missing values interpolated


In [53]:
# Save processed data
df_processed.to_csv('preprocessed_exp.csv', index=False)
print("\nPreprocessed data saved to 'preprocessed_DATA.csv'")


Preprocessed data saved to 'preprocessed_DATA.csv'


4. DESCRIPTIVE STATISTICS

In [54]:
# Detailed descriptive statistics
desc_stats = df_processed[numeric_columns].describe()
print(desc_stats)

              Open         High          Low        Close    Adj Close  \
count  1009.000000  1009.000000  1009.000000  1009.000000  1009.000000   
mean    419.059673   425.320703   412.374044   419.000733   419.000733   
std     108.537532   109.262960   107.555867   108.289999   108.289999   
min     233.919998   250.649994   231.229996   233.880005   233.880005   
25%     331.489990   336.299988   326.000000   331.619995   331.619995   
50%     377.769989   383.010010   370.880005   378.670013   378.670013   
75%     509.130005   515.630005   502.529999   509.079987   509.079987   
max     692.349976   700.989990   686.090027   691.690002   691.690002   

             Volume  
count  1.009000e+03  
mean   7.570685e+06  
std    5.465535e+06  
min    1.144000e+06  
25%    4.091900e+06  
50%    5.934500e+06  
75%    9.322400e+06  
max    5.890430e+07  


5. DISTRIBUTION ANALYSIS

In [55]:
# 5. Distribution Analysis
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_columns, 1):
    plt.subplot(2, 3, i)
    
    # Histogram with KDE
    sns.histplot(df_processed[col], kde=True)
    plt.title(f'{col} Distribution')
    plt.tight_layout()

plt.savefig('distribution_plots/histograms.png')
plt.close()

In [56]:
# Box Plots
plt.figure(figsize=(15, 5))
df_processed[numeric_columns].plot(kind='box')
plt.title('Box Plot of Numeric Columns')
plt.tight_layout()
plt.savefig('distribution_plots/boxplots.png')
plt.close()

<Figure size 1080x360 with 0 Axes>

6. RELATIONSHIP EXPLORATION

In [57]:
# 6. Relationship Exploration
# Correlation Matrix
correlation_matrix = df_processed[numeric_columns].corr()

In [58]:
# Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('distribution_plots/correlation_heatmap.png')
plt.close()

print(correlation_matrix)

               Open      High       Low     Close  Adj Close    Volume
Open       1.000000  0.998605  0.998508  0.996812   0.996812 -0.415838
High       0.998605  1.000000  0.998203  0.998551   0.998551 -0.400699
Low        0.998508  0.998203  1.000000  0.998544   0.998544 -0.432116
Close      0.996812  0.998551  0.998544  1.000000   1.000000 -0.413362
Adj Close  0.996812  0.998551  0.998544  1.000000   1.000000 -0.413362
Volume    -0.415838 -0.400699 -0.432116 -0.413362  -0.413362  1.000000


In [59]:
# Display the first few rows of processed data
print("\nProcessed Data Preview:")
display(df_processed.head())


Processed Data Preview:


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-02-05,262.0,267.899994,250.029999,254.259995,254.259995,11896100
1,2018-02-06,247.699997,266.700012,245.0,265.720001,265.720001,12595800
2,2018-02-07,266.579987,272.450012,264.329987,264.559998,264.559998,8981500
3,2018-02-08,267.079987,267.619995,250.0,250.100006,250.100006,9306700
4,2018-02-09,253.850006,255.800003,236.110001,249.470001,249.470001,16906900
