<a href="https://colab.research.google.com/github/SanjayBhargavKudupudi/DATA_MINING/blob/main/spatio%20temporal/spatio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
Path = "/content/drive/MyDrive/DATA_MINING_DATASETS/GTFS_Data.csv"


# Load the dataset

data = pd.read_csv(Path)

In [None]:
# Statistical summary of the data
summary_stats = data.describe(include='all')

# Checking for missing values
missing_values = data.isnull().sum()

(summary_stats, missing_values)


(        stop_id_from    stop_id_to  \
 count   66913.000000  66913.000000   
 unique           NaN           NaN   
 top              NaN           NaN   
 freq             NaN           NaN   
 mean    33078.882683  33096.450286   
 std     11893.792935  11873.818983   
 min       386.000000    386.000000   
 25%     33494.000000  33496.000000   
 50%     38784.000000  38785.000000   
 75%     39280.000000  39284.000000   
 max     40516.000000  40516.000000   
 
                                                   trip_id arrival_time  \
 count                                               66913        66644   
 unique                                               5349         5905   
 top     NORMAL_360_Mhalungegaon To Alandi Via Balewadi...     14:00:00   
 freq                                                   57          162   
 mean                                                  NaN          NaN   
 std                                                   NaN          NaN   
 min 

In [None]:
import numpy as np

# Addressing infinite values in the 'speed' column
# Replacing infinite values with NaN
data['speed'].replace([np.inf, -np.inf], np.nan, inplace=True)

# Now let's deal with the missing values.
# For simplicity, we'll replace missing numeric values with the median and missing categorical values with the mode.
# This is a basic approach and can be refined later based on further analysis.

# Numeric columns
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

# Checking if there are any remaining missing values
remaining_missing_values = data.isnull().sum()

remaining_missing_values


stop_id_from            0
stop_id_to              0
trip_id                 0
arrival_time            0
time                    0
speed                   0
Number_of_trips         0
SRI                     0
Degree_of_congestion    0
dtype: int64

In [None]:
from datetime import datetime

# Converting 'arrival_time' to datetime format and extracting hour, minute, and second
data['arrival_hour'] = pd.to_datetime(data['arrival_time'], format='%H:%M:%S').dt.hour
data['arrival_minute'] = pd.to_datetime(data['arrival_time'], format='%H:%M:%S').dt.minute
data['arrival_second'] = pd.to_datetime(data['arrival_time'], format='%H:%M:%S').dt.second

# Dropping the original 'arrival_time' column
data = data.drop('arrival_time', axis=1)

# Converting 'SRI' to numeric, as it seems to be a numeric value in string format
data['SRI'] = pd.to_numeric(data['SRI'], errors='coerce')

# Filling any NaNs generated during this conversion with the median (basic approach)
data['SRI'] = data['SRI'].fillna(data['SRI'].median())

# Check the first few rows of the updated dataframe
data.head()


Unnamed: 0,stop_id_from,stop_id_to,trip_id,time,speed,Number_of_trips,SRI,Degree_of_congestion,arrival_hour,arrival_minute,arrival_second
0,36156,38709,NORMAL_333_Pune Station To Hinjawadi Maan Pha...,0.027222,14.479565,9.0,-0.408163,Very smooth,9,13,54
1,36156,38709,NORMAL_115P_Pune Station to Hinjawadi Phase 3_...,0.032222,12.232736,9.0,1.206897,Smooth,9,3,1
2,36156,38709,NORMAL_100_Ma Na Pa to Hinjawadi Maan Phase 3_...,0.058333,6.75713,9.0,5.142857,Heavy congestion,9,15,0
3,36156,38709,NORMAL_VJR5_Ma Na Pa To Mukai Chowk_Up-0905_0,0.033611,11.727251,9.0,1.570248,Smooth,9,5,0
4,36156,38709,NORMAL_100_Ma Na Pa to Hinjawadi Maan Phase 3_...,0.058333,6.75713,9.0,5.142857,Heavy congestion,9,0,0


In [None]:
# Feature Engineering: Categorizing 'arrival_hour' into different parts of the day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data['time_of_day'] = data['arrival_hour'].apply(categorize_time_of_day)

# Correlation Analysis among numerical variables
correlation_matrix = data.select_dtypes(include=['float64', 'int64']).corr()

# Displaying the new feature and the correlation matrix
data[['arrival_hour', 'time_of_day']].head(), correlation_matrix


(   arrival_hour time_of_day
 0             9     Morning
 1             9     Morning
 2             9     Morning
 3             9     Morning
 4             9     Morning,
                  stop_id_from  stop_id_to      time     speed  \
 stop_id_from         1.000000    0.393264  0.005545  0.017108   
 stop_id_to           0.393264    1.000000 -0.000351  0.008144   
 time                 0.005545   -0.000351  1.000000 -0.007297   
 speed                0.017108    0.008144 -0.007297  1.000000   
 Number_of_trips      0.092999    0.086616 -0.020755 -0.045733   
 SRI                 -0.012179    0.000154  0.007849 -0.474662   
 arrival_hour         0.001807   -0.000075  0.006934 -0.015473   
 arrival_minute      -0.001442   -0.001314 -0.035027  0.006566   
 arrival_second      -0.004316    0.000237 -0.051070  0.013809   
 
                  Number_of_trips       SRI  arrival_hour  arrival_minute  \
 stop_id_from            0.092999 -0.012179      0.001807       -0.001442   
 stop_id_

In [None]:
# Categorizing 'arrival_hour' into different parts of the day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data['time_of_day'] = data['arrival_hour'].apply(categorize_time_of_day)


In [None]:
# Calculate the correlation matrix
correlation_matrix = data.select_dtypes(include=['float64', 'int64']).corr()
print(correlation_matrix)


                 stop_id_from  stop_id_to      time     speed  \
stop_id_from         1.000000    0.393264  0.005545  0.017108   
stop_id_to           0.393264    1.000000 -0.000351  0.008144   
time                 0.005545   -0.000351  1.000000 -0.007297   
speed                0.017108    0.008144 -0.007297  1.000000   
Number_of_trips      0.092999    0.086616 -0.020755 -0.045733   
SRI                 -0.012179    0.000154  0.007849 -0.474662   
arrival_hour         0.001807   -0.000075  0.006934 -0.015473   
arrival_minute      -0.001442   -0.001314 -0.035027  0.006566   
arrival_second      -0.004316    0.000237 -0.051070  0.013809   

                 Number_of_trips       SRI  arrival_hour  arrival_minute  \
stop_id_from            0.092999 -0.012179      0.001807       -0.001442   
stop_id_to              0.086616  0.000154     -0.000075       -0.001314   
time                   -0.020755  0.007849      0.006934       -0.035027   
speed                  -0.045733 -0.474662   

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding for categorical variables
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(data[['time_of_day', 'Degree_of_congestion', 'trip_id']])





In [None]:
from sklearn.preprocessing import StandardScaler

# Selecting numerical columns to scale
num_cols = ['time', 'speed', 'Number_of_trips', 'SRI', 'arrival_hour', 'arrival_minute', 'arrival_second']

scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])


In [None]:
from sklearn.model_selection import train_test_split

# Assuming we are predicting 'Degree_of_congestion'
X = data.drop('Degree_of_congestion', axis=1)  # Features
y = data['Degree_of_congestion']               # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the regression model
regressor = RandomForestRegressor(random_state=42)

# Train the model
regressor.fit(X_train, y_train)

# Evaluate the model
score = regressor.score(X_test, y_test)
print(f"Test Score: {score}")



Test Score: 0.8892231874349728


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Load the dataset again
file_path = '/content/drive/MyDrive/DATA_MINING_DATASETS/GTFS_Data.csv'
data = pd.read_csv(file_path)

# Selecting numeric columns for analysis
numeric_cols = ['time', 'speed', 'Number_of_trips', 'SRI']
data_numeric = data[numeric_cols]

# Replacing non-numeric and infinite values with NaN
data_numeric = data_numeric.apply(pd.to_numeric, errors='coerce')
data_numeric = data_numeric.replace([np.inf, -np.inf], np.nan)

# Replacing missing values with the median of each column
data_numeric = data_numeric.fillna(data_numeric.median())

# Standardizing the numeric columns
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# Anomaly Detection
iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
anomalies = iso_forest.fit_predict(data_scaled)
data['anomaly'] = anomalies  # adding anomalies to original data

# Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(data_scaled)
data['cluster'] = clusters  # adding cluster labels to original data

# Summary of Anomaly Detection and Clustering
anomaly_count = (data['anomaly'] == -1).sum()
cluster_counts = data['cluster'].value_counts()

anomaly_count, cluster_counts, data.head()




(669,
 0    66699
 2      147
 1       67
 Name: cluster, dtype: int64,
    stop_id_from  stop_id_to  \
 0         36156       38709   
 1         36156       38709   
 2         36156       38709   
 3         36156       38709   
 4         36156       38709   
 
                                              trip_id arrival_time      time  \
 0  NORMAL_333_Pune Station To  Hinjawadi Maan Pha...     09:13:54  0.027222   
 1  NORMAL_115P_Pune Station to Hinjawadi Phase 3_...     09:03:01  0.032222   
 2  NORMAL_100_Ma Na Pa to Hinjawadi Maan Phase 3_...     09:15:00  0.058333   
 3      NORMAL_VJR5_Ma Na Pa To Mukai Chowk_Up-0905_0     09:05:00  0.033611   
 4  NORMAL_100_Ma Na Pa to Hinjawadi Maan Phase 3_...     09:00:00  0.058333   
 
        speed  Number_of_trips          SRI Degree_of_congestion  anomaly  \
 0  14.479565              9.0  -0.40816322          Very smooth        1   
 1  12.232736              9.0    1.2068965               Smooth        1   
 2   6.757130        

In [6]:
!pip install h2o


Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=8edfd5ac809ae6ddeaf02f3d6b0c535b1cabdb6dbb76ad41fbe58cd90c3ad1e4
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [7]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1drzpa2h
  JVM stdout: /tmp/tmp1drzpa2h/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp1drzpa2h/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_unknownUser_q9pwxr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [9]:
# Load data into H2O
data_h2o = h2o.import_file('/content/drive/MyDrive/DATA_MINING_DATASETS/GTFS_Data.csv')

# Selecting the target and predictors
target = 'time'  # Change this to your target variable
predictors = data_h2o.columns
predictors.remove(target)

# Splitting the dataset
train, test = data_h2o.split_frame(ratios=[.8], seed=42)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
automl = H2OAutoML(max_models=20, seed=42, max_runtime_secs=1000, sort_metric="RMSE")
automl.train(x=predictors, y=target, training_frame=train)


AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,95.0,95.0,331924.0,10.0,10.0,10.0,76.0,348.0,164.69473

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.0436459,0.0059933,0.0407405,0.0400079,0.0442235,0.0394221,0.0538356
mean_residual_deviance,0.3938021,0.0786109,0.4441419,0.3336633,0.365252,0.3207338,0.5052196
mse,0.3938021,0.0786109,0.4441419,0.3336633,0.365252,0.3207338,0.5052196
r2,0.2834274,0.0714586,0.3531057,0.2255257,0.1950776,0.2945089,0.3489192
residual_deviance,0.3938021,0.0786109,0.4441419,0.3336633,0.365252,0.3207338,0.5052196
rmse,0.6251116,0.0616195,0.6664397,0.5776359,0.6043608,0.5663337,0.710788
rmsle,0.1077859,0.0061517,0.1042321,0.1075663,0.1130898,0.0996249,0.1144163

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2023-11-01 03:19:40,41.593 sec,0.0,0.7485915,0.0536183,0.5603892
,2023-11-01 03:19:40,41.945 sec,5.0,0.6948076,0.0479556,0.4827576
,2023-11-01 03:19:40,42.243 sec,10.0,0.6676544,0.0460273,0.4457623
,2023-11-01 03:19:41,42.545 sec,15.0,0.6570296,0.0446672,0.4316879
,2023-11-01 03:19:41,42.833 sec,20.0,0.6509404,0.0445247,0.4237234
,2023-11-01 03:19:41,43.144 sec,25.0,0.6481805,0.0447014,0.420138
,2023-11-01 03:19:42,43.533 sec,30.0,0.6456834,0.0439,0.4169071
,2023-11-01 03:19:42,44.085 sec,35.0,0.6440936,0.0440812,0.4148566
,2023-11-01 03:19:43,44.625 sec,40.0,0.6431918,0.0436896,0.4136957
,2023-11-01 03:19:43,45.156 sec,45.0,0.6423745,0.0432547,0.412645

variable,relative_importance,scaled_importance,percentage
arrival_time,32640.6074219,1.0,0.5661036
speed,10195.7109375,0.3123628,0.1768297
SRI,6141.2304688,0.1881469,0.1065107
trip_id,4695.7749023,0.143863,0.0814413
Number_of_trips,2046.748291,0.0627056,0.0354979
stop_id_to,957.9515991,0.0293485,0.0166143
stop_id_from,754.7350464,0.0231226,0.0130898
Degree_of_congestion,225.6082916,0.0069119,0.0039128
