# Stock Market Data Analysis

This notebook processes, cleans, and visualizes historical stock market data.
It generates summary statistics, plots, and interactive visualizations.

1. Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

2. Confugration

In [2]:
CSV_FILE        = "combined_stock_data.csv"
START_DATE      = "2019-01-01"
END_DATE        = "2021-12-31"
SUMMARY_FILE    = "summary_stats_filtered.csv"

3. Define Functions

In [3]:
def remove_outliers_iqr(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Remove outliers from a given column using the IQR method."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

def save_plot(fig, filename: str):
    """Save and close a Matplotlib figure."""
    fig.savefig(filename)
    plt.close(fig)


4. Load and Clean

In [4]:
# Load & validate data
file_path = CSV_FILE
try:
    df = pd.read_csv(file_path, parse_dates=['Date'])
except FileNotFoundError:
    raise SystemExit('Error: File not found. Please check the path.')
except pd.errors.ParserError:
    raise SystemExit('Error: File format invalid or corrupted.')
except Exception as err:
    raise SystemExit(f'Unexpected error loading data: {err}')
stock_data = df
df = stock_data
required = {'Date', 'Company', 'Close', 'Volume'}

if not required.issubset(df.columns):
    missing = required - set(df.columns)
    raise SystemExit(f'Error: Missing columns: {missing}')

# Basic cleaning
stock_data.dropna(inplace=True)
stock_data.sort_values(by="Date", inplace=True)

# Remove outliers properly
stock_data = remove_outliers_iqr(stock_data, "Volume")
stock_data = remove_outliers_iqr(stock_data, "Close")

# Filter date range
df = stock_data
start = START_DATE
end = END_DATE

start_dt, end_dt = (pd.to_datetime(start), pd.to_datetime(end))
if start_dt > end_dt:
    raise SystemExit('Error: Start date is after end date.')
stock_data = df[(df['Date'] >= start_dt) & (df['Date'] <= end_dt)].copy()

# Show first rows after cleaning
stock_data.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Company
6597,2019-01-02,32.110001,33.470001,32.110001,33.450001,33.355457,78900,ABTX
3644,2019-01-02,39.689999,39.689999,38.939999,39.299999,38.04821,235000,AAT
1325,2019-01-02,15.0,16.290001,14.85,15.88,15.88,478300,AAOI
6598,2019-01-03,33.380001,33.84,33.060001,33.549999,33.455173,62800,ABTX
1326,2019-01-03,15.51,15.8,15.015,15.06,15.06,398400,AAOI


5. Quick Overview

In [5]:
stock_data["Daily_Change"] = stock_data.groupby("Company")["Close"].pct_change() * 100
summary_stats = stock_data.groupby("Company")["Close"].agg(
    mean="mean", median="median", max="max", min="min", std="std"
)
summary_stats.to_csv(SUMMARY_FILE)
summary_stats

Unnamed: 0_level_0,mean,median,max,min,std
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAOI,11.362349,10.935,17.790001,5.5,2.463865
AAT,44.627524,46.119999,48.959999,21.77,4.945657
ABBV,72.973889,74.389999,78.169998,62.98,4.689739
ABTX,34.496,35.09,38.799999,22.24,3.056954


6. Visualizations

In [6]:
# Line plot: Closing price over time
fig, ax = plt.subplots(figsize=(14, 7))
for company in stock_data["Company"].unique():
    subset = stock_data[stock_data["Company"] == company]
    ax.plot(subset["Date"], subset["Close"], label=company)
ax.set_title("Closing Prices Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Close Price (USD)")
ax.legend()
save_plot(fig, "line_plot.png")

# Histogram: Daily % change
fig, ax = plt.subplots(figsize=(14, 7))
ax.hist(stock_data["Daily_Change"].dropna(), edgecolor="black")
ax.set_title("Distribution of Daily Changes")
ax.set_xlabel("Change (%)")
save_plot(fig, "histogram.png")

# Scatter plot: Volume vs Close with size representing % change
fig, ax = plt.subplots(figsize=(14, 7))
for company in stock_data["Company"].unique():
    subset = stock_data[stock_data["Company"] == company]
    ax.scatter(
        subset["Volume"],
        subset["Close"],
        s=subset["Daily_Change"].abs().fillna(0) * 5,  
        alpha=0.5,
        label=company
    )
ax.set_title("Volume vs. Close Price (Bubble size = % Change)")
ax.set_xlabel("Volume (shares)")
ax.set_ylabel("Close Price (USD)")
ax.legend()
save_plot(fig, "scatter_plot.png")

# Interactive plot: Closing price trends
fig_px = px.line(
    stock_data, x="Date", y="Close", color="Company",
    title="Interactive Trend of Closing Price"
)
fig_px.write_html("interactive_plot.html")


In [7]:
print("Analysis complete. Results saved in current directory.")


Analysis complete. Results saved in current directory.
