In [1]:
#Load the dataset
import pandas as pd
df=pd.read_excel("C:/Users/Srividhya/Desktop/ClosePriceData.xlsx")
print(df)




         Date    close 
0   2023-01-03  4241.85
1   2023-01-02  4292.15
2   2022-12-30  4307.45
3   2022-12-29  4348.30
4   2022-12-28  4385.95
..         ...      ...
245 2022-01-07  3737.35
246 2022-01-06  3675.10
247 2022-01-05  3659.70
248 2022-01-04  3638.45
249 2022-01-03  3617.55

[250 rows x 2 columns]


In [7]:
#After importing the excel data, let's enter into preprocessing
#1) Checking for missing values
#2) Conversion of date column to date time format for easy indexing, sorting and calculations.

missing_values=df.isna().sum()
missing_values
df.columns
df.columns=df.columns.str.strip()
df.columns
df['Date']=pd.to_datetime(df['Date'])
print(df['Date'])



0     2023-01-03
1     2023-01-02
2     2022-12-30
3     2022-12-29
4     2022-12-28
         ...    
245   2022-01-07
246   2022-01-06
247   2022-01-05
248   2022-01-04
249   2022-01-03
Name: Date, Length: 250, dtype: datetime64[ns]


In [9]:
#3) Determine the descriptive statistics such as mean, median, std etc.
descriptive=df['close'].describe()
print(descriptive)

count     250.000000
mean     3672.922400
std       344.323323
min      3094.500000
25%      3395.262500
50%      3656.600000
75%      3797.562500
max      4526.500000
Name: close, dtype: float64


In [15]:
#4) Test whether the data is stationary. To proceed with that, let us apply Augmented Dickey-Fuller test.
#Augmented Dickey Fuller test is a statistical test to determine the presence of unit root indicating non-stationarity.  
#It helps us analyse if differencing the data is necessary to make it suitable for various time series analyses, such as forecasting.

from statsmodels.tsa.stattools import adfuller
Close=df['close']
result=adfuller(Close)
print(result)
print("ADF Statistic:", result[0])
print("p-value:", result[1])
print("Critical Values:", result[4])
if result[1] <= 0.05:
    print("p-value <= 0.05: Null hypothesis is rejected,the data is stationary")
else:
    print("p-value > 0.05,Null hypothesis is accepted, the data is non-stationary")


(-1.7861277271242983, 0.38737031564359997, 0, 249, {'1%': -3.4568881317725864, '5%': -2.8732185133016057, '10%': -2.5729936189738876}, 2518.3507697069617)
ADF Statistic: -1.7861277271242983
p-value: 0.38737031564359997
Critical Values: {'1%': -3.4568881317725864, '5%': -2.8732185133016057, '10%': -2.5729936189738876}
p-value > 0.05,Null hypothesis is accepted, the data is non-stationary
1      50.30
2      15.30
3      40.85
4      37.65
5     -16.70
       ...  
245   -30.95
246   -62.25
247   -15.40
248   -21.25
249   -20.90
Name: close, Length: 249, dtype: float64


In [19]:
#Since the data is non-stationary, we should apply ADF test to make it stationary as it is an important assumption to build an ARIMA model.

diff = Close.diff().dropna()
print(diff)
result_diff = adfuller(diff)
print(result_diff)
print("ADF Statistic:", result_diff[0])
print("p-value:", result_diff[1])
if result_diff[1] <= 0.05:
    print("p-value <= 0.05: Reject the null hypothesis, differenced data is likely stationary")
else:
    print("p-value > 0.05: Fail to reject the null hypothesis, differenced data is likely non-stationary")
    

1      50.30
2      15.30
3      40.85
4      37.65
5     -16.70
       ...  
245   -30.95
246   -62.25
247   -15.40
248   -21.25
249   -20.90
Name: close, Length: 249, dtype: float64
(-14.81948657697191, 1.9737024966461387e-27, 0, 248, {'1%': -3.4569962781990573, '5%': -2.8732659015936024, '10%': -2.573018897632674}, 2515.1094975378087)
ADF Statistic: -14.81948657697191
p-value: 1.9737024966461387e-27
p-value <= 0.05: Reject the null hypothesis, differenced data is likely stationary


In [24]:
#Now that the data has became stationary after differencing, let us export the data to carry out the analysis
diff
diff.to_csv('differenced.csv',index=False)
diff.to_csv('C:/Users/Srividhya/Desktop/differenced.csv')
