In [None]:
import pandas as pd # Importing pandas for data manipulation
import numpy as np # Importing numpy for numerical computations
# Preprocessing helpers: feature scalers and categorical encoders
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder
# Utility for splitting data into training and testing subsets
from sklearn.model_selection import train_test_split

In [None]:
# Step 1: Load Dataset (the CSV file is created separately and must already exist at this path)
csv_path = '/content/MTA_Daily_Ridership.csv'
df = pd.read_csv(csv_path)  # Parse the CSV into a pandas DataFrame
print("Original Data:")
preview = df.head()  # First five rows of the dataset
print(preview)

Original Data:
         Date  Subways: Total Estimated Ridership  \
0  2020-03-01                             2212965   
1  2020-03-02                             5329915   
2  2020-03-03                             5481103   
3  2020-03-04                             5498809   
4  2020-03-05                             5496453   

   Subways: % of Comparable Pre-Pandemic Day  \
0                                         97   
1                                         96   
2                                         98   
3                                         99   
4                                         99   

   Buses: Total Estimated Ridership  Buses: % of Comparable Pre-Pandemic Day  \
0                            984908                                       99   
1                           2209066                                       99   
2                           2228608                                       99   
3                           2177165                      

In [None]:
# View the last five rows of the dataset (5 is also the default for tail())
print(df.tail(5))

            Date  Subways: Total Estimated Ridership  \
1701  2024-10-27                             2215973   
1702  2024-10-28                             3985381   
1703  2024-10-29                             4391065   
1704  2024-10-30                             4450028   
1705  2024-10-31                             4027166   

      Subways: % of Comparable Pre-Pandemic Day  \
1701                                         90   
1702                                         69   
1703                                         76   
1704                                         77   
1705                                         70   

      Buses: Total Estimated Ridership  \
1701                            737246   
1702                           1460653   
1703                           1528993   
1704                           1553250   
1705                           1297891   

      Buses: % of Comparable Pre-Pandemic Day  \
1701                                       74   
1702 

In [None]:
# Step 2: Handling Missing Values
print("\nChecking for missing values:")
missing_per_column = df.isna().sum()  # Number of missing entries in each column
print(missing_per_column)
# Keep only rows with no missing value in any column
# (alternative: df.fillna(value) to fill the gaps instead of dropping rows)
df = df.dropna(axis=0, how='any')


Checking for missing values:
Date                                                       0
Subways: Total Estimated Ridership                         0
Subways: % of Comparable Pre-Pandemic Day                  0
Buses: Total Estimated Ridership                           0
Buses: % of Comparable Pre-Pandemic Day                    0
LIRR: Total Estimated Ridership                            0
LIRR: % of Comparable Pre-Pandemic Day                     0
Metro-North: Total Estimated Ridership                     0
Metro-North: % of Comparable Pre-Pandemic Day              0
Access-A-Ride: Total Scheduled Trips                       0
Access-A-Ride: % of Comparable Pre-Pandemic Day            0
Bridges and Tunnels: Total Traffic                         0
Bridges and Tunnels: % of Comparable Pre-Pandemic Day      0
Staten Island Railway: Total Estimated Ridership           0
Staten Island Railway: % of Comparable Pre-Pandemic Day    0
dtype: int64


In [None]:
# Step 3: Handling Duplicates
# This cell previously repeated the Step 2 missing-value code verbatim, while its
# recorded output shows a duplicate check; the code below performs that check.
print("\nChecking for duplicate:")
print(df.duplicated().sum()) # Count rows that are exact copies of an earlier row
df = df.drop_duplicates() # Drop repeated rows, keeping the first occurrence of each


Checking for duplicate:
0


In [None]:
# Step 4: Encoding Categorical Variables
print("\nEncoding categorical variables:")
categorical_cols_ = df.select_dtypes(include=['object']).columns # Selecting categorical (object-dtype) columns
label_encoders = {} # Fitted encoder per column, kept so codes can be mapped back to labels later
for col in categorical_cols_:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col]) # Replace each distinct value with an integer code
  label_encoders[col] = le
# Show the result once, after every column has been encoded. Inside the loop the
# preview was repeated per column and skipped entirely when no column needed encoding.
print(df.head()) # Display transformed dataset


Encoding categorical variables:
   Date  Subways: Total Estimated Ridership  \
0     0                             2212965   
1     1                             5329915   
2     2                             5481103   
3     3                             5498809   
4     4                             5496453   

   Subways: % of Comparable Pre-Pandemic Day  \
0                                         97   
1                                         96   
2                                         98   
3                                         99   
4                                         99   

   Buses: Total Estimated Ridership  Buses: % of Comparable Pre-Pandemic Day  \
0                            984908                                       99   
1                           2209066                                       99   
2                           2228608                                       99   
3                           2177165                                       9

In [None]:
# Step 5: Feature Scaling
# NOTE(review): the scaler is fitted on every numeric column of the full DataFrame,
# before any train/test split; this appears to include the label-encoded columns and
# whichever column is later used as the target. Confirm that this is intended.
print("\nApplying feature scalling:")
scaler = StandardScaler()  # Standardizes each column to zero mean and unit variance
numeric_cols = df.select_dtypes(include=[np.number]).columns  # Names of all numeric columns
scaled_values = scaler.fit_transform(df[numeric_cols])  # Fit on, then transform, the numeric columns
df[numeric_cols] = scaled_values  # Write the standardized values back into the DataFrame
print(df.head())  # Display scaled dataset


Applying feature scalling:
       Date  Subways: Total Estimated Ridership  \
0 -1.731036                           -0.278838   
1 -1.729005                            2.656496   
2 -1.726975                            2.798875   
3 -1.724944                            2.815549   
4 -1.722914                            2.813331   

   Subways: % of Comparable Pre-Pandemic Day  \
0                                   2.096454   
1                                   2.045984   
2                                   2.146924   
3                                   2.197394   
4                                   2.197394   

   Buses: Total Estimated Ridership  Buses: % of Comparable Pre-Pandemic Day  \
0                         -0.049881                                 2.297177   
1                          2.730711                                 2.297177   
2                          2.775100                                 2.297177   
3                          2.658250                     

In [None]:
# Step 6: Splitting Dataset into Training and Testing Sets
# Requires `df` from the earlier cells; on a fresh kernel run them first, otherwise
# this cell fails with NameError.
print("\nSplitting dataset into training and testing sts:")
# Target column, spelled exactly as in the DataFrame header. The previous code used two
# different names for x and y, neither of which exists in the data (KeyError).
# Assumes the subway "% of comparable pre-pandemic day" column is the intended target.
target_col = 'Subways: % of Comparable Pre-Pandemic Day'
x = df.drop(columns=[target_col]) # Features: every column except the target
y = df[target_col] # Target variable
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print(f"Training set size:{x_train.shape}, Testing set size:{x_test.shape}")


Splitting dataset into training and testing sts:


NameError: name 'df' is not defined