In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): absolute, machine-specific path. It is kept in one named constant
# so it is easy to repoint when the notebook runs on another machine.
TRAIN_CSV_PATH = r'D:\Stock_trend_project\data\raw\train.csv'
# Fixed seed so the ticker sample (and everything derived from it) is identical
# after Restart Kernel -> Run All.
RANDOM_SEED = 42

train = pd.read_csv(TRAIN_CSV_PATH)

sns.set()

# Draw up to `n_tickers_sample` distinct tickers (without replacement); if the
# data holds fewer tickers than that, all of them are kept.
unique_tickers = train['Ticker'].dropna().unique()
n_tickers_sample = 1000
ticker_rng = np.random.default_rng(RANDOM_SEED)
sampled_tickers = ticker_rng.choice(
    unique_tickers,
    size=min(n_tickers_sample, len(unique_tickers)),
    replace=False,
)

# Work on an independent copy so the column assignment below never touches `train`.
train_ticker_sample = train[train['Ticker'].isin(sampled_tickers)].copy()

# Parse dates so that the sort below is chronological rather than lexicographic.
train_ticker_sample['Date'] = pd.to_datetime(train_ticker_sample['Date'])

train_ticker_sample = train_ticker_sample.sort_values(['Ticker', 'Date'])

def sample_per_ticker(df, n_per_ticker=500):
    """Return at most ``n_per_ticker`` rows of ``df`` taken at evenly spaced positions.

    Row positions are generated with ``np.linspace`` between the first and the
    last row and truncated to integers. When more than one row is kept, the
    first and last rows are therefore always part of the result. If ``df`` has
    no more than ``n_per_ticker`` rows, every row is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to thin out; row order is preserved.
    n_per_ticker : int, default 500
        Upper bound on the number of rows returned.

    Returns
    -------
    pandas.DataFrame
        The positionally selected rows of ``df``.
    """
    total_rows = len(df)
    rows_to_keep = min(n_per_ticker, total_rows)
    positions = np.linspace(0, total_rows - 1, num=rows_to_keep).astype(int)
    return df.iloc[positions]

n_per_ticker = 500

# Thin each ticker's history down to at most `n_per_ticker` evenly spaced rows.
# group_keys=True places the ticker in index level 0 of the result, and
# reset_index(level=0) then moves it back into a regular column.
eda_sample = (
    train_ticker_sample
    .groupby('Ticker', group_keys=True)
    .apply(sample_per_ticker, n_per_ticker=n_per_ticker, include_groups=False)
    .reset_index(level=0)
    .rename(columns={"level_0": "Ticker"})
)


## Computing daily returns per stock

This code sorts the data by company and date, then computes the daily percentage return for each company as $r_t = (\text{Close}_t - \text{Close}_{t-1}) / \text{Close}_{t-1}$, drops the first day of each company (where the return is NaN), and finally shows a preview of the resulting DailyReturn alongside the original Close prices.

In [5]:

# Order rows by ticker and date so that pct_change compares consecutive rows of
# the same ticker. sort_values already returns a new frame, so no extra .copy()
# is needed and `train` itself is left untouched.
# NOTE(review): in the visible cells only the sampled copy had 'Date' parsed to
# datetime. If train['Date'] is still a string here, this sort is lexicographic,
# which equals chronological order only for ISO-8601 (YYYY-MM-DD) values.
returns_df = train.sort_values(["Ticker", "Date"])

# Return_t = (Close_t - Close_{t-1}) / Close_{t-1}, computed within each ticker.
# fill_method=None switches off the deprecated implicit forward-fill, so a
# missing Close yields a NaN return instead of a fabricated 0% move.
returns_df["DailyReturn"] = (
    returns_df
    .groupby("Ticker")["Close"]
    .pct_change(fill_method=None)
)

# Drop rows without a defined return: the first row of each ticker (it has no
# previous close) and any row whose return involves a missing Close.
returns_df = returns_df.dropna(subset=["DailyReturn"])

returns_df[["Ticker", "Date", "Close", "DailyReturn"]].head(10)


Unnamed: 0,Ticker,Date,Close,DailyReturn
25,ticker_1,1962-01-03,0.261788,0.0
49,ticker_1,1962-01-04,0.26098,-0.003086
76,ticker_1,1962-01-05,0.255324,-0.021673
97,ticker_1,1962-01-08,0.256536,0.004747
113,ticker_1,1962-01-09,0.256132,-0.001575
129,ticker_1,1962-01-10,0.255324,-0.003155
144,ticker_1,1962-01-11,0.256536,0.004747
171,ticker_1,1962-01-12,0.258556,0.007874
198,ticker_1,1962-01-15,0.259364,0.003125
219,ticker_1,1962-01-16,0.255728,-0.014018


## Inspecting daily return behaviour

This block summarizes the DailyReturn feature by: reporting basic descriptive statistics, measuring the fraction of days with positive vs negative returns, counting extreme daily moves larger than 20% in absolute value, counting days with exactly zero return, and listing the top 10 most positive and the top 10 most negative daily returns to inspect the most extreme cases.

In [6]:

# A notebook cell renders only its final bare expression, so each intermediate
# summary is passed to display() (made available by the IPython kernel) to make
# it visible in the output.
daily_returns = returns_df["DailyReturn"]

# Basic descriptive statistics.
display(daily_returns.describe())

# Fraction of days with a positive / negative return.
display(((daily_returns > 0).mean(), (daily_returns < 0).mean()))

# Number of extreme daily moves (absolute return above 20%).
display((daily_returns.abs() > 0.2).sum())

# Shape (rows, columns) of the subset whose return is exactly zero.
display(returns_df[daily_returns == 0].shape)

# Ten most positive, then ten most negative daily returns.
display(returns_df.nlargest(10, "DailyReturn"))
returns_df.nsmallest(10, "DailyReturn")


Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DailyReturn
18529921,ticker_4051,2022-09-15,0.17,0.17,0.17,0.17,0.0,0.0,0.0,-0.98
11406743,ticker_2489,2014-09-03,7.5,7.5,7.5,7.5,1.0,0.0,0.0,-0.975
5173867,ticker_504,2003-03-26,0.383368,0.447262,0.319473,0.351421,22074378.0,0.0,0.0,-0.971867
19417766,ticker_4274,2023-06-08,0.208,0.215,0.171,0.185,2394600.0,0.0,0.0,-0.96209
10856590,ticker_2188,2013-11-08,3.8,5.5,3.8,5.4,1480.0,0.0,0.0,-0.946054
2853648,ticker_948,1996-07-30,4.76,4.76,4.76,4.76,95.0,0.0,0.0,-0.933333
20695655,ticker_4361,2024-06-17,2.41,2.51,1.5,1.65,25207800.0,0.0,0.0,-0.932981
8205243,ticker_2188,2009-06-22,2.0,2.0,1.0,1.2,3500.0,0.0,0.0,-0.929412
1267459,ticker_349,1988-12-05,2.169958,2.789946,2.169958,2.169958,15162.0,0.0,0.0,-0.927083
20892410,ticker_4822,2024-08-13,3.35,4.42,2.65,3.2,2416000.0,0.0,0.0,-0.920988
