In [1]:
import pandas as pd

In [2]:
# Read the raw CSV export into the working DataFrame
df = pd.read_csv(filepath_or_buffer='Dataset.csv')

In [3]:
# Show each column's dtype and non-null count for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Symbol          1128 non-null   object 
 1   Date            1128 non-null   object 
 2   Open            1128 non-null   float64
 3   High            1128 non-null   float64
 4   Low             1128 non-null   float64
 5   Close           1128 non-null   float64
 6   Percent Change  1128 non-null   object 
 7   Volume          1128 non-null   object 
dtypes: float64(4), object(4)
memory usage: 70.6+ KB


In [4]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Percent Change,Volume
0,ADBL,2024-08-29,390.0,390.0,374.0,376.4,-1.59 %,171319.0
1,ADBL,2024-08-28,382.0,394.9,378.0,382.5,0.13 %,173834.0
2,ADBL,2024-08-27,397.0,397.0,379.0,382.0,-4.26 %,316327.0
3,ADBL,2024-08-25,402.8,415.0,398.2,399.0,-2.92 %,280315.0
4,ADBL,2024-08-22,411.0,420.2,410.3,411.0,-0.94 %,184187.0


In [5]:
# Summary statistics. describe() covers numeric columns by default, so only
# Open/High/Low/Close appear here: 'Percent Change' and 'Volume' are still
# object (string) columns at this point in the notebook.
df.describe()

Unnamed: 0,Open,High,Low,Close
count,1128.0,1128.0,1128.0,1128.0
mean,366.019309,370.556472,360.247163,364.908599
std,95.955139,97.289997,94.148392,95.700882
min,225.0,225.0,223.0,223.4
25%,266.6,269.0,260.0,263.0
50%,374.0,382.0,368.0,374.5
75%,435.0,440.0,428.0,433.0
max,604.8,620.0,590.0,597.0


In [7]:
# Count missing values per column
df.isna().sum()

Symbol            0
Date              0
Open              0
High              0
Low               0
Close             0
Percent Change    0
Volume            0
dtype: int64

Convert the data types


In [9]:
# Replace non-numeric values in 'Percent Change' and 'Volume'.
# Cells that are exactly '-' become missing (pd.NA); the '%' sign and ','
# separators are then removed.
# regex=False makes every pattern a plain literal on all pandas versions: the
# default for `regex` changed from True to False in pandas 2.0, and 1.x emits a
# FutureWarning for single-character patterns when it is left implicit.
# .str.strip() drops the whitespace left behind by values such as '-1.59 %'.
df['Percent Change'] = (
    df['Percent Change']
    .replace('-', pd.NA)
    .str.replace('%', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['Volume'] = (
    df['Volume']
    .replace('-', pd.NA)
    .str.replace(',', '', regex=False)
    .str.strip()
)

In [10]:
# Parse the cleaned strings as floats; errors='coerce' turns anything that
# still cannot be parsed into NaN instead of raising.
df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
# 'Percent Change' is additionally rescaled from percent to a fraction.
df['Percent Change'] = pd.to_numeric(df['Percent Change'], errors='coerce').div(100)


In [11]:
# Parse the 'Date' strings into pandas datetimes
df['Date'] = df['Date'].pipe(pd.to_datetime)

# Re-inspect the schema to verify the converted dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Symbol          1128 non-null   object        
 1   Date            1128 non-null   datetime64[ns]
 2   Open            1128 non-null   float64       
 3   High            1128 non-null   float64       
 4   Low             1128 non-null   float64       
 5   Close           1128 non-null   float64       
 6   Percent Change  1127 non-null   float64       
 7   Volume          1128 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 70.6+ KB


Feature Engineering

In [12]:
# Derived feature: the spread between each row's high and low price
df['Price Range'] = df['High'].sub(df['Low'])


Standardize Data (zero mean, unit variance)

In [13]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise (the list is reused by later cells)
features = ['Open', 'High', 'Low', 'Close', 'Percent Change', 'Volume', 'Price Range']

# Learn each column's mean and standard deviation, then overwrite the columns
# in `df` with their standardised values.
# NOTE(review): the scaler is fit on every row of `df`, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Confirm whether fitting on the training rows only is required.
scaler = StandardScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])


Split the dataset

In [14]:
from sklearn.model_selection import train_test_split

# Define features and target.
# 'Close' is the target, so it is excluded from the predictors: keeping it in X
# would give a model the answer directly (target leakage).
y = df['Close']   # Target variable (example)
X = df[[col for col in features if col != 'Close']]  # Features

# Split the data into training and testing sets (80 % / 20 %, fixed seed).
# NOTE(review): 'Percent Change' has one missing value after the numeric
# conversion; it is still present in X here and most estimators reject NaN --
# decide whether to drop or impute that row before fitting.
# NOTE(review): train_test_split shuffles rows by default, while the rows are
# dated observations -- confirm whether a chronological split is wanted instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Saving the cleaned data

In [15]:
# Write the processed frame to CSV without the index column.
# The columns listed in `features` were overwritten with standardised values
# earlier in the notebook, so the file holds those scaled values.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

# Report where the output was written
print("Data cleaned and saved to 'cleaned_data.csv'")


Data cleaned and saved to 'cleaned_data.csv'
