In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [69]:
# Load the data
data = pd.read_csv('000905.csv')

## 1 Inspect the Data

In [36]:
#Inspect the Data: Begin by examining the structure and contents of the CSV file. This includes looking at the column headers, data types, and sample records to understand the data format.
print(data.head())

   Unnamed: 0      open      high      low     close       volume
0  2005-01-04   996.682   996.682  984.795   986.927  232376203.0
1  2005-01-05   986.570  1008.855  985.677  1003.633  348610113.0
2  2005-01-06  1003.490  1003.490  990.792   994.595  293390559.0
3  2005-01-07   993.434  1009.000  990.446   997.606  339162698.0
4  2005-01-10   996.928  1006.457  993.123  1006.337  294940533.0


In [17]:
print(data.columns)

Index(['Unnamed: 0', 'open', 'high', 'low', 'close', 'volume'], dtype='object')


In [70]:
# rename column 'Unnamed: 0' to 'Date'
data = data.rename(columns={'Unnamed: 0': 'Date'})

In [19]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    8034 non-null   object 
 1   open    8034 non-null   float64
 2   high    8034 non-null   float64
 3   low     8034 non-null   float64
 4   close   8034 non-null   float64
 5   volume  8034 non-null   float64
dtypes: float64(5), object(1)
memory usage: 376.7+ KB
None


In [20]:
print(data.describe())

               open          high           low         close        volume
count   8034.000000   8034.000000   8034.000000   8034.000000  8.034000e+03
mean    7122.675520   7197.789496   7045.453155   7126.946329  1.078476e+10
std     4409.544966   4453.433419   4360.452588   4412.009497  1.479926e+10
min      401.570000    408.020000    397.670000    402.500000  0.000000e+00
25%     3194.040000   3222.460000   3162.988000   3195.782000  5.312708e+08
50%     7320.574500   7381.802000   7268.354000   7332.967000  3.200612e+09
75%    10724.656250  10810.198000  10642.386000  10741.288250  1.680722e+10
max    19554.581000  19600.026000  19203.110000  19531.155000  7.668641e+10


## 2 Data Cleaning and Preprocessing

In [71]:
# how many Nas are in the data
print(data.isnull().sum())

# Handle missing values
# data.dropna(inplace=True)  # Drop rows with missing values

Date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64


In [39]:
# how many duplicates are in the data
print(data.duplicated().sum())

# Remove duplicates
# data.drop_duplicates(inplace=True)

0


In [72]:
# Data Cleaning

# Data Preprocessing
# Convert date column to datetime
data['Date'] = pd.to_datetime(data['Date'])
# Sort the DataFrame by date
data.sort_values(by='Date', inplace=True)
# Reset index
data.reset_index(drop=True, inplace=True)

# Additional preprocessing steps can be added here as needed

In [24]:
data.head()

Unnamed: 0,Date,open,high,low,close,volume
0,1991-04-03,988.05,988.05,988.05,988.05,100.0
1,1991-04-04,983.11,983.11,983.11,983.11,52300.0
2,1991-04-05,978.27,978.27,978.27,978.27,10700.0
3,1991-04-08,968.57,968.57,968.57,968.57,5700.0
4,1991-04-09,963.73,963.73,963.73,963.73,1900.0


## 3 Data Transformation

In [73]:
# Calculate daily returns
data['Daily_Return'] = data['close'].pct_change()

In [74]:
# Calculate moving averages (e.g., 10-day and 50-day moving averages)
data['MA_10'] = data['close'].rolling(window=10).mean()
data['MA_50'] = data['close'].rolling(window=50).mean()

In [75]:
# Calculate relative strength index (RSI)
def calculate_rsi(data, window=14):
    delta = data['close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

data['RSI'] = calculate_rsi(data)

In [76]:
# Logarithm the close
data['Log_Close'] = np.log(data['close'])

In [44]:
data.head()

Unnamed: 0,Date,open,high,low,close,volume,Daily_Return,MA_10,MA_50,RSI,Log_Close
0,2005-01-04,996.682,996.682,984.795,986.927,232376203.0,,,,,6.894596
1,2005-01-05,986.57,1008.855,985.677,1003.633,348610113.0,0.016927,,,,6.911382
2,2005-01-06,1003.49,1003.49,990.792,994.595,293390559.0,-0.009005,,,,6.902336
3,2005-01-07,993.434,1009.0,990.446,997.606,339162698.0,0.003027,,,,6.905358
4,2005-01-10,996.928,1006.457,993.123,1006.337,294940533.0,0.008752,,,,6.914072


## 4 Ploting

In [77]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Create figure
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=("Price and Moving Averages", "Log Close Price"))

# Adding traces for Price and Moving Averages
fig.add_trace(go.Scatter(x=data['Date'], y=data['close'], mode='lines', name='Close Price'), row=1, col=1)
fig.add_trace(go.Scatter(x=data['Date'], y=data['MA_10'], mode='lines', name='10-day MA'), row=1, col=1)
fig.add_trace(go.Scatter(x=data['Date'], y=data['MA_50'], mode='lines', name='50-day MA'), row=1, col=1)

# Adding trace for Log Close Price
fig.add_trace(go.Scatter(x=data['Date'], y=data['Log_Close'], mode='lines', name='Log Close Price'), row=2, col=1)

# Update y-axes labels
fig.update_yaxes(title_text="Price", row=1, col=1)
fig.update_yaxes(title_text="Log Price", row=2, col=1)

# Update x-axis label and title
fig.update_xaxes(title_text="Date")

# Update layout
fig.update_layout(title="Stock Price with Moving Averages and Log Close Price",
                  hovermode="x unified",  # Show hover information for all traces at the same x-coordinate
                  showlegend=True)

# Show plot
fig.show()


## 5 Drawdown


In [78]:
from scipy.signal import find_peaks
# Extract data from the 'close' column
close_values = data['close']

# Use the find_peaks function to locate all peak positions, setting the minimum distance between peaks to 100 units
peaks_indices, _ = find_peaks(close_values, distance=100)

# Sort the indices based on the magnitude of the peaks
sorted_peaks_indices = sorted(peaks_indices, key=lambda x: close_values[x], reverse=True)

# Select the positions of the top 2 peaks
top_2_peaks_indices = sorted_peaks_indices[:2]

# Sort the selected two peaks based on their indices
top_2_peaks_indices_sorted = sorted(top_2_peaks_indices)

print("Positions of the top 2 peaks (sorted by index):", top_2_peaks_indices_sorted)


Positions of the top 2 peaks (sorted by index): [2534, 2644]


In [None]:
data[data]

In [51]:
start_date = top_2_peaks_indices[1]
start_date = data.iloc[start_date]['Date']
start_date


'2015-11-25'

In [80]:
# Split the data into 3 parts based on the selected two indices
data_1 = data.loc[:top_2_peaks_indices[0]]
data_2 = data.loc[top_2_peaks_indices[0]:top_2_peaks_indices[1]]
data_3 = data.loc[top_2_peaks_indices[1]:]

# Define a function to calculate the maximum drawdown
def calculate_max_drawdown(log_prices, window_size):

    
    max_drawdown = 0
    start_date = None
    end_date = None

    for i in range(len(log_prices) - window_size + 1):
        window = log_prices.iloc[i:i + window_size]
        peak = window.iloc[0]

        for date, log_price in window.items():
            if log_price > peak:
                peak = log_price
            else:
                drawdown = (log_price - peak) / peak
                if drawdown < max_drawdown:
                    max_drawdown = drawdown
                    start_date = window.index[0]  # Start date is the first date of the window
                    end_date = date  # End date is the current date
                    # print(type(start_date),type(end_date))

    return max_drawdown, start_date, end_date

# Iterate over the data parts and calculate the maximum drawdown for each
for datas in [data_1, data_2, data_3]:
    max_drawdown, start_index, end_index  = calculate_max_drawdown(datas['close'], window_size=100)

    start_date = data.iloc[start_index]['Date']
    end_date = data.iloc[end_index ]['Date']

    print(f"Max Drawdown: {max_drawdown * 100:.2f}%")
    print(f"Start Date: {start_date}")
    print(f"End Date: {end_date}")


Max Drawdown: -58.18%
Start Date: 2008-06-03 00:00:00
End Date: 2008-10-29 00:00:00
Max Drawdown: -50.56%
Start Date: 2015-06-12 00:00:00
End Date: 2015-09-15 00:00:00
Max Drawdown: -33.46%
Start Date: 2015-11-25 00:00:00
End Date: 2016-01-28 00:00:00


## 6 Growth


In [81]:

# Define a function to calculate the maximum drawdown
def calculate_max_growth(log_prices, window_size):

    
    max_growth = 0
    start_date = None
    end_date = None

    for i in range(len(log_prices) - window_size + 1):
        window = log_prices.iloc[i:i + window_size]
        min = window.iloc[0]

        for date, log_price in window.items():
            if log_price < min:
                min = log_price
            else:
                growth = (log_price - min) / min
                if growth > max_growth:
                    max_growth = growth
                    start_date = window.index[0]  # Start date is the first date of the window
                    end_date = date  # End date is the current date
                    # print(type(start_date),type(end_date))

    return max_growth, start_date, end_date

In [82]:
# Split the data into 3 parts based on the selected two indices
data_1 = data.loc[:top_2_peaks_indices[0]]
data_2 = data.loc[top_2_peaks_indices[0]:top_2_peaks_indices[1]]
data_3 = data.loc[top_2_peaks_indices[1]:]

for datas in [data_1, data_2, data_3]:
    max_growth, start_index, end_index  = calculate_max_growth(datas['close'], window_size=100)

    start_date = data.iloc[start_index]['Date']
    end_date = data.iloc[end_index ]['Date']

    print(f"Max growth: {max_growth * 100:.2f}%")
    print(f"Start Date: {start_date}")
    print(f"End Date: {end_date}")



Max growth: 184.72%
Start Date: 2006-12-22 00:00:00
End Date: 2007-05-29 00:00:00
Max growth: 38.79%
Start Date: 2015-06-30 00:00:00
End Date: 2015-11-25 00:00:00
Max growth: 42.76%
Start Date: 2018-11-07 00:00:00
End Date: 2019-04-04 00:00:00
