In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import calendar
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the data set using pandas.
# The path is written as a raw string so the Windows backslashes are taken
# literally; in a normal string literal `\E` and `\o` are invalid escape sequences.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the load works on other machines.
DATA_PATH = r'E:\Excelr-P269-Internship-Project-Group2-OIl-Price-Prediction-And-Forecasting-Using-Python\oil_prices_yahoo1.csv'
data = pd.read_csv(DATA_PATH)

In [9]:
# Convert the Date column to datetime using pandas; errors='coerce' turns
# entries that cannot be parsed into NaT instead of raising.
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

# Create the working data frame from the loaded data.
# NOTE(review): `data` presumably is already a DataFrame, so this wrap may be
# redundant — confirm before simplifying.
df=pd.DataFrame(data)

# Rename the 'Close' column to 'Price' (modifies df in place).
df.rename(columns = {'Close':'Price'}, inplace = True)

In [10]:
# Show the first and last rows of the frame, one block after the other.
print(df.head(), df.tail(), sep='\n')

        Date      Price
0 2000-08-23  32.049999
1 2000-08-24  31.629999
2 2000-08-25  32.049999
3 2000-08-28  32.869999
4 2000-08-29  32.720001
           Date      Price
5746 2023-07-17  74.150002
5747 2023-07-18  75.750000
5748 2023-07-19  75.349998
5749 2023-07-20  75.629997
5750 2023-07-21  77.070000


In [11]:
# info() summarizes the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5751 entries, 0 to 5750
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    5751 non-null   datetime64[ns]
 1   Price   5751 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 90.0 KB


In [12]:
# Dimensions of the frame as (rows, columns).
df.shape

(5751, 2)

In [13]:
# Count the missing (null) values in each column.
df.isnull().sum()

Date     0
Price    0
dtype: int64

In [14]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

def summarize_time_series(data):
    """Print descriptive statistics and Augmented Dickey-Fuller results for `data`.

    `data` is the series of observations to summarize (the notebook passes a
    pandas Series). The mean, median, minimum, maximum and standard deviation
    are computed with NumPy, then `adfuller` is run on the same data. Everything
    is printed; nothing is returned.
    """
    # Descriptive statistics, keyed by the label they are printed under.
    stats = {
        'Mean': np.mean(data),
        'Median': np.median(data),
        'Minimum': np.min(data),
        'Maximum': np.max(data),
        'Standard Deviation': np.std(data),
    }

    # Augmented Dickey-Fuller test: index 0 is the statistic, 1 the p-value,
    # 4 the mapping of significance level -> critical value.
    adf_output = adfuller(data)
    statistic, pval, thresholds = adf_output[0], adf_output[1], adf_output[4]

    print('Summary Statistics:')
    for label, value in stats.items():
        print(f'{label}: {value}')

    print('\nDickey-Fuller Test Results:')
    print(f'ADF Statistic: {statistic}')
    print(f'p-value: {pval}')
    for level, cutoff in thresholds.items():
        print(f'{level}: {cutoff}')

# Summarize the Price column and test it for a unit root.
summarize_time_series(df['Price'])


Summary Statistics:
Mean: 63.85376977713248
Median: 61.540000915527344
Minimum: -37.630001068115234
Maximum: 145.2899932861328
Standard Deviation: 25.586500058183233

Dickey-Fuller Test Results:
ADF Statistic: -2.696203550982578
p-value: 0.07469752769722504
1%: -3.431493547724062
5%: -2.862045338455472
10%: -2.567038989861576


The ADF (Augmented Dickey-Fuller) statistic is -2.696204 and the p-value is 0.074698. The critical values for the test at the 1%, 5%, and 10% significance levels are -3.431, -2.862, and -2.567, respectively. If the ADF statistic is less than (more negative than) the critical value at a given significance level, we can reject the null hypothesis of a unit root and conclude that the time series is stationary at that significance level. Here the statistic is below only the 10% critical value, so the null hypothesis is rejected at the 10% level but not at the 5% or 1% levels.

In [None]:
In addition to statistical tests, you can also visually examine the time series data to check for stationarity. One way to do this is by plotting the time series and looking for trends or seasonality. You can also split the time series into different partitions and compare the statistical properties of each partition. If the mean, variance, and other statistical properties are stable across time, this suggests that the time series may be stationary.

It’s important to note that stationarity is an assumption of many time series forecasting models, so it’s important to determine if your data is stationary before building a model. If your data is non-stationary, you may need to transform it (e.g., by differencing or detrending) to make it stationary before building a model.

Based on the output above, the mean of the price series is **63.85**, the median is **61.54**, the minimum value is **-37.63**, and the maximum value is **145.29**. The standard deviation is **25.59**, which indicates that the data has a moderate amount of variability. Note that the minimum is negative, so a logarithm cannot be computed for every observation in the series.

The Dickey-Fuller test results show an ADF statistic of **-2.70** and a p-value of **0.07**. The p-value is greater than the commonly used significance level of 0.05, which suggests that we cannot reject the null hypothesis that the time series has a unit root (i.e., it is non-stationary). However, the p-value is relatively close to 0.05, so you may want to perform additional tests or examine the data visually to determine if it is stationary or not.

The critical values for the test at the 1%, 5%, and 10% levels are **-3.43**, **-2.86**, and **-2.57**, respectively. The ADF statistic (**-2.70**) is greater than the 1% and 5% critical values but less than the 10% critical value, so we cannot reject the null hypothesis of non-stationarity at the 1% or 5% level, although we can at the 10% level. This agrees with the p-value of 0.07.

Is there anything else you would like to know? 😊

In [16]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import kpss

def kpss_test(data, regression='c'):
    """Run the KPSS stationarity test on `data` and print the outcome.

    Parameters
    ----------
    data : array-like
        Observations to test (the notebook passes a pandas Series).
    regression : {'c', 'ct'}, default 'c'
        Forwarded to `kpss`. 'c' tests the null hypothesis that the data is
        stationary around a constant level; 'ct' tests stationarity around a
        deterministic trend. The default matches the library default, so
        existing calls behave as before.

    Returns
    -------
    None
        The statistic, p-value and critical values are printed.
    """
    # Perform KPSS test
    result = kpss(data, regression=regression)
    kpss_stat = result[0]
    # kpss interpolates the p-value from a lookup table and returns a boundary
    # value when the statistic falls outside it, so a reported 0.01 means
    # "at most 0.01" and 0.1 means "at least 0.1".
    p_value = result[1]
    critical_values = result[3]

    # Print KPSS test results
    print('KPSS Test Results:')
    print('KPSS Statistic: {}'.format(kpss_stat))
    print('p-value: {}'.format(p_value))
    for key, value in critical_values.items():
        print('{}: {}'.format(key, value))

# Run the KPSS test on the Price column.
kpss_test(df['Price'])


KPSS Test Results:
KPSS Statistic: 2.3890250979796517
p-value: 0.01
10%: 0.347
5%: 0.463
2.5%: 0.574
1%: 0.739


Based on the output above, the KPSS statistic for the price series is **2.39** and the p-value is **0.01**. The reported p-value is a lower bound from the test's lookup table, so the true p-value is at most 0.01. Since the p-value is less than the commonly used significance level of 0.05, we can reject the null hypothesis that the time series is stationary around a constant level (the hypothesis tested by `kpss` with its default `regression='c'`). This means that the time series may have a unit root and be non-stationary.

The critical values for the test at the 10%, 5%, 2.5%, and 1% levels are **0.347**, **0.463**, **0.574**, and **0.739**, respectively. Since the KPSS statistic is greater than all of these critical values, this also suggests that we can reject the null hypothesis of level stationarity.

If your time series data is non-stationary, you may need to transform it (e.g., by differencing or detrending) to make it stationary before building a forecasting model. There are several methods for transforming non-stationary data, and the appropriate method will depend on the characteristics of your data.

Is there anything else you would like to know? 😊

In [17]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

def test_stationarity(data):
    """Run the Augmented Dickey-Fuller test on `data` and print the outcome.

    Prints the test statistic, the p-value and every critical value returned
    by `adfuller`. Nothing is returned.
    """
    adf_output = adfuller(data)
    # Index 0 is the statistic, 1 the p-value, 4 the critical-value mapping.
    statistic, pval, thresholds = adf_output[0], adf_output[1], adf_output[4]

    report = [
        'Dickey-Fuller Test Results:',
        f'ADF Statistic: {statistic}',
        f'p-value: {pval}',
    ]
    report.extend(f'{level}: {cutoff}' for level, cutoff in thresholds.items())
    print('\n'.join(report))

def make_stationary(data, window=12):
    """Apply three candidate transforms to `data` and ADF-test each result.

    The transforms are: first difference, subtraction of a rolling mean, and
    natural logarithm. Each transformed series is passed to
    `test_stationarity`, which prints the Dickey-Fuller results.

    Parameters
    ----------
    data : pandas.Series
        Series to transform (uses `.diff()`, `.rolling()` and boolean masking).
    window : int, default 12
        Number of observations in the rolling mean used for detrending.
        The default keeps the value that was previously hard-coded.

    Returns
    -------
    None
        Results are printed; the transformed series are not returned.
    """
    # Calculate first difference (the first element is NaN and is dropped)
    diff = data.diff().dropna()

    # Test for stationarity
    print('\nFirst Difference:')
    test_stationarity(diff)

    # Detrend data by subtracting the rolling mean; the first window-1
    # observations have no rolling mean and are dropped
    detrended = (data - data.rolling(window=window).mean()).dropna()

    # Test for stationarity
    print('\nDetrended Data:')
    test_stationarity(detrended)

    # Take logarithm of data. The log is undefined for values <= 0 (NaN for
    # negatives, -inf for zero), so those observations are excluded explicitly
    # rather than being dropped silently or reaching the test as -inf.
    positive = data[data > 0]
    n_excluded = len(data) - len(positive)
    log_data = np.log(positive)

    # Test for stationarity
    print('\nLogarithm of Data:')
    if n_excluded:
        print('Note: excluded {} non-positive or missing value(s) before taking the log'.format(n_excluded))
    test_stationarity(log_data)

# Try the three transforms on the Price column and print the ADF results for each.
make_stationary(df['Price'])



First Difference:
Dickey-Fuller Test Results:
ADF Statistic: -12.47495528713085
p-value: 3.183489760467687e-23
1%: -3.431493547724062
5%: -2.862045338455472
10%: -2.567038989861576

Detrended Data:
Dickey-Fuller Test Results:
ADF Statistic: -9.906011501169225
p-value: 3.262152993179196e-17
1%: -3.431496756314084
5%: -2.862046756071205
10%: -2.5670397445004576

Logarithm of Data:
Dickey-Fuller Test Results:
ADF Statistic: -2.541675506706014
p-value: 0.10566061922346581
1%: -3.431492349126996
5%: -2.862044808892236
10%: -2.567038707959433
