In [1]:
import math

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data Visualisation and EDA

In [2]:
# Read the raw stock data from the CSV file and preview its first rows
data = pd.read_csv(filepath_or_buffer="Data.csv")
data.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'Data.csv'

In [None]:
# List the column labels of the loaded frame
data.columns

In [None]:
# Count the missing entries in each column
data.isna().sum()

Trade-related data is missing for every stock before 2012. From 2012 onwards, all values are available for all stocks.

In [None]:
# Keep only the rows dated 2012-01-01 or later.
# The raw 'Date' strings are parsed before comparing, with the same
# dayfirst/mixed settings the notebook uses when it later converts the column,
# so the cut-off is a real date comparison. Comparing the unparsed strings
# lexically is only correct when every value is ISO formatted (YYYY-MM-DD).
# The 'Date' column itself is left untouched here.
cutoff = pd.Timestamp('2012-01-01')
parsed_dates = pd.to_datetime(data['Date'], dayfirst=True, format='mixed')

# .copy() gives an independent frame for the column assignments in later cells
data = data[parsed_dates >= cutoff].copy()
data.isnull().sum()

No NULL values are present from here on.

In [None]:
# Collect every distinct ticker symbol, in order of first appearance
Tickers = data["Symbol"].drop_duplicates().tolist()

# Show the list itself, then how many symbols it holds
print(Tickers)
print(f"Total Number of Stocks: {len(Tickers)}")

The dataset contains 53 ticker symbols. Since we are working with NIFTY 50 data, there should be only 49 stocks (we don't have data for INFRATEL). The count is 53 because 4 tickers were renamed during 2012-2021, so the old names show up as 4 extra symbols: MUNDRAPORT, UNIPHOS, SESAGOA and SSLT.

MUNDRAPORT and UNIPHOS were renamed when the companies rebranded, so the data under these tickers can be treated as data under their new tickers (ADANIPORTS and UPL).

In cases like SESAGOA and SSLT, where the companies merged and became part of Vedanta Limited (VEDL), the business fundamentals might have changed. This means that historical data before the merger might not fully represent the current company's operations. So, data under these tickers should be ignored.

In [None]:
# Old ticker -> new ticker for the symbols that are carried over
symbol_mapping = dict(MUNDRAPORT='ADANIPORTS', UNIPHOS='UPL')
data['Symbol'] = data['Symbol'].replace(symbol_mapping)

# Discard every row recorded under the tickers that are excluded
excluded_symbols = ['SESAGOA', 'SSLT']
keep_rows = ~data["Symbol"].isin(excluded_symbols)
data = data[keep_rows]

# Rebuild the ticker list from the cleaned frame and report it
Tickers = data["Symbol"].unique().tolist()
print(Tickers, f"Total Number of Stocks: {len(Tickers)}", sep="\n")

In [None]:
# Parse the 'Date' strings into datetimes, then move that column into the
# row index (set_index drops the column from the frame as it does so)
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True, format='mixed')
data = data.set_index('Date')
data.head()

In [None]:
# Draw one closing-price line chart per ticker on a shared grid of subplots.

# Grid layout: a fixed number of columns and as many rows as the tickers need.
# The row count is derived from `cols`, so changing `cols` is enough to
# reshape the grid.
num_symbols = len(Tickers)
cols = 6
rows = math.ceil(num_symbols / cols)

# Create a figure with subplots; each grid row is 4 inches tall
fig, axes = plt.subplots(rows, cols, figsize=(36, 4 * rows))
fig.suptitle('Closing Prices for Different Symbols', fontsize=20)

# Flatten the axes grid so it can be paired one-to-one with the ticker list
axes = axes.flatten()

# Plot closing prices for each symbol on its own subplot
for ax, symbol in zip(axes, Tickers):
    # Rows belonging to the current symbol
    symbol_data = data[data["Symbol"] == symbol]

    # The frame's index supplies the x values
    ax.plot(symbol_data.index, symbol_data["Close"], color='blue', linestyle='-', linewidth=1)

    # Titles and axis labels for this subplot
    ax.set_title(f'{symbol} Closing Prices', fontsize=10)
    ax.set_xlabel('Date', fontsize=8)
    ax.set_ylabel('Closing Price (INR)', fontsize=8)

    # Rotate the x-axis labels for better readability
    ax.tick_params(axis='x', rotation=45)

    # Year-month tick labels; a fresh formatter is created for every axis
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Light dashed gridlines
    ax.grid(visible=True, linestyle='--', alpha=0.5)

# Remove the grid cells that have no symbol to show. Slicing by the ticker
# count avoids depending on a loop variable left over from the plotting loop.
for unused_ax in axes[num_symbols:]:
    fig.delaxes(unused_ax)

# Adjust layout, leaving space at the top for the main title
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


Here it can be seen that some stocks show a sudden drop in closing price. These drops are caused by external factors that we are not going to model here, so these stocks may need a different EDA/model from the stocks with regular trends.

In [None]:
# Threshold on the absolute fractional change between consecutive closes
change_threshold = 0.3

# Result lists: symbols without / with at least one move above the threshold
stable_trend = []
sudden_changes = []

for symbol in Tickers:
    # Closing prices of this symbol as a NumPy array, in row order
    symbol_data = data.loc[data["Symbol"] == symbol, "Close"].to_numpy()

    # Change of each close relative to the close in the previous row
    percent_changes = (symbol_data[1:] - symbol_data[:-1]) / symbol_data[:-1]

    # True when any single step exceeds the threshold in magnitude
    sudden_change_detected = (np.abs(percent_changes) > change_threshold).any()

    # File the symbol under the matching list
    target_list = sudden_changes if sudden_change_detected else stable_trend
    target_list.append(symbol)

# Report both groups
print("Stable Trend Symbols:", stable_trend)
print("Sudden Change Symbols:", sudden_changes)

Here, we perform EDA on one symbol from each group instead of a separate EDA for every symbol, assuming that the other symbols in the same group behave similarly.

In [None]:
# Extract the rows of the two symbols that are examined individually
data_adaniports = data[data["Symbol"] == 'ADANIPORTS']
data_asianpaints = data[data["Symbol"] == 'ASIANPAINT']

# Show each frame under a label that matches its ticker
print("ADANIPORTS:")
display(data_adaniports)
print("ASIANPAINT:")
display(data_asianpaints)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
print("ADANIPORTS")
data_adaniports.info()

print("ASIANPAINT")
data_asianpaints.info()