In [None]:
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Step 1 : Load the dataset
# The path is relative, so the CSV is looked up in the current working directory.
df = pd.read_csv("Tesla_stock_data.csv")

# Step 2 : Explore data
# Column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
df.info()

# Summary statistics of the numeric columns
print("\nDescription of dataset :")
print(df.describe())

# Count of null values present in each column
print("\nMissing Values:")
print(df.isnull().sum())

# Step 3 : Handle missing data (No missing data in this dataset)
# Based on the output of df.isnull().sum(), there are no missing values in this dataset.
# Therefore, no missing data handling is needed for this specific dataset.

# Step 4 : Handle Categorical Variables (none to encode in this dataset)
# 'Date' is the only object-typed column, and it holds dates rather than categories, so it should not be label- or one-hot-encoded.
# If date-related features are needed, extract them from the 'Date' column (e.g., year, month, day).
# For now, no categorical encoding is performed, since the dataset has no columns suitable for it.

# Step 5 : Standardize the numeric features
# Only int64/float64 columns are selected, so the object-typed 'Date'
# column is left out of scaling.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
print("\nNumerical Columns:", numerical_cols)

# Fit the scaler on the selected columns, then overwrite those columns in df
# with their standardized values.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

print("\nAfter Scaling (StandardScaler):")
print(df.loc[:, numerical_cols].head())

# Step 6 : Split the dataset
# To split the dataset for time series data like stock prices,
# a simple random split is usually not appropriate as it disrupts the temporal order.
# A common approach is to split based on time.
# However, without a defined target variable for a specific task (e.g., predicting 'Close' price),
# splitting into features (X) and target (Y) and then into train/test sets
# as done in the original code is not applicable for this dataset in its current form.
# If you have a specific prediction task in mind, please define the target variable.

# A previous version of this cell tried to split on a 'survived' column, which does not exist in this dataset.
# That split section has been removed because it is not relevant to the current dataset and context.

# If you intend to use this data for time series forecasting,
# you would typically prepare the data differently,
# for example, creating lagged features or using time-aware splitting methods.

# For now, the code will perform data exploration, check for missing values,
# and apply StandardScaler to the numerical columns.


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    3798 non-null   object 
 1   Close   3798 non-null   float64
 2   High    3798 non-null   float64
 3   Low     3798 non-null   float64
 4   Open    3798 non-null   float64
 5   Volume  3798 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 178.2+ KB
None

Description of dataset :
             Close         High          Low         Open        Volume
count  3798.000000  3798.000000  3798.000000  3798.000000  3.798000e+03
mean     90.559224    92.608116    88.425626    90.587811  9.709981e+07
std     115.245077   117.954878   112.476077   115.345283  7.660983e+07
min       1.053333     1.108667     0.998667     1.076000  1.777500e+06
25%      12.577667    12.806167    12.289500    12.574500  5.055585e+07
50%      19.046000    19.430333    18.700001    19.00033