# Objective 1 - Analysis of the Impact of Train Strikes on People Movement
### Feature Engineering

- André Novo
- Beatriz Paulino
- Catarina Brito
- Luís Pereira

#### Used Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#### Filter the various months by parishes to be used

In [2]:
# get path 
root = os.getcwd()

# Define the folder containing the data
pasta_folder = 'data' 
pasta_path = os.path.join(root, pasta_folder)

# IDs of the parishes to filter the dataset
grid_ids_to_keep = [1758, 1759, 1699, 3563, 3564, 3565, 3319, 3320, 933, 934, 990, 75, 76, 3736, 3728, 3524, 3564, 3460, 3500, 3501, 698, 757, 758, 102, 103]

# List to store filtered DataFrames
filtered_dfs = []

csv_files = [file for file in os.listdir(pasta_path) if file.endswith('.csv')]

# Loop through each CSV file
for filename in csv_files:
    file_path = os.path.join(pasta_path, filename)
    df = pd.read_csv(file_path)
    df = df[df['Grid_ID'].isin(grid_ids_to_keep)]
    filtered_dfs.append(df)

# Concatenate all the filtered DataFrames into a single DataFrame
result_df = pd.concat(filtered_dfs, ignore_index=True)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


In [5]:
# Sort the DataFrame by the 'Datetime' column to ensure chronological order
result_df.sort_values(by='Datetime',inplace=True)

In [6]:
# Display the sorted DataFrame to verify the changes
result_df

Unnamed: 0.1,Unnamed: 0,Grid_ID,Datetime,C1,C2,C3,C4,C5,C6,C7,...,E2,E3,E4,E5,E6,E7,E8,E9,E10,C8
182616,9235296,3564,2023-01-01T00:00:00.000Z,17.639999,0.0,17.639999,0.0,0.0,20.84,0.0,...,0.0,0.0,0,0,,4,9.75,14,0.0,
167064,8602848,3320,2023-01-01T00:00:00.000Z,89.760002,4.65,82.099998,4.65,7.65,43.200001,0.0,...,0.0,0.0,0,0,,0,8.35,18,0.0,
138552,264384,103,2023-01-01T00:00:00.000Z,44.02,9.3,44.02,9.3,5.66,45.419998,0.0,...,0.0,0.0,0,0,,2,8.33,16,0.0,
141144,1806624,698,2023-01-01T00:00:00.000Z,35.860001,0.0,35.860001,0.0,0.0,13.96,0.0,...,0.0,0.0,0,0,,0,33.630001,177,0.0,
185208,9237888,3565,2023-01-01T00:00:00.000Z,65.599998,0.0,65.599998,0.0,0.0,3.6,0.0,...,0.0,0.0,0,0,,4,8.33,11,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258303,10170161,102,2023-06-30T23:45:00.000Z,140.059998,6.41,41.380001,3.21,117.860001,100.910004,6.41,...,54864.738281,10564.389648,4704575,331245,android:Total Traffic,0,36.400002,300,0.0,7.650000
258302,10170081,3320,2023-06-30T23:45:00.000Z,220.479996,21.870001,130.850006,8.17,136.110001,109.309998,14.58,...,4503.339844,3030.379883,327876,42856,android:Total Traffic,0,15.01,89,0.0,10.890000
258301,10169954,933,2023-06-30T23:45:00.000Z,84.779999,21.870001,35.209999,7.29,65.839996,79.989998,21.870001,...,718.929993,2609.149902,28119,87328,android:Total Traffic,0,8.83,34,0.0,20.629999
258313,10171818,698,2023-06-30T23:45:00.000Z,170.169998,10.5,63.25,3.21,103.480003,123.239998,10.5,...,1556.25,773.059998,204528,27545,android:Total Traffic,0,13.14,116,0.0,9.040000


#### Export to CSV

In [7]:
result_df.to_csv('data_complete.csv')

In [8]:
# Clear memory by deleting the DataFrame
del result_df

# Feature Engineering with data_complete

In [9]:
# Load the dataset with all months combined
data = pd.read_csv("data_complete.csv")

In [10]:
# Keep only the columns relevant to the objective 3
keep_cols = ["Grid_ID", "Datetime", "C1", "C2", "C3", "C4"]
data = data[keep_cols]

In [11]:
# Convert the 'Datetime' column to datetime format
data['Datetime'] = pd.to_datetime(data['Datetime'])

In [12]:
# Create 'Time' and 'Date' columns from the 'Datetime' column
data['Time'] = pd.to_datetime(data['Datetime'], format='%H:%M').dt.time
data['Date']=data['Datetime'].dt.date

### Creation of Strike Column
<p>Strike days in each month considered for analysis:</p>
<ul>
  <li>January -> 1-8 - Minimum Services</li>
  <li>February -> 2-21, 27 - Total Strike</li>
  <li>March -> 10 - Minimum Services</li>
  <li>March -> 11-17 - Partial Strike</li>
  <li>April -> 1-5, 7-30 - Partial Strike</li>
  <li>April -> 6 - Minimum Services</li>
  <li>May -> 31 - Minimum Services, 30 - Partial Strike</li>
  <li>June -> 5-30 - Partial Strike</li>
</ul>

In [13]:
# Create the 'Strike' column with default value 'No'
data['Strike'] = 'No'

#### Strike Column Categories:
- No - No strike
- MinS - Minimum Services (>=75% stoppage)
- Partial - Partial Strike
- Total - Total Strike

In [15]:
# Identify the strike days and their types

# January
data.loc[(data['Datetime'].dt.month == 1) & (data['Datetime'].dt.day >= 1) & (data['Datetime'].dt.day <= 8), 'Strike'] = 'MinS'

# February
data.loc[(data['Datetime'].dt.month == 2) & (data['Datetime'].dt.day >= 2) & (data['Datetime'].dt.day <= 21), 'Strike'] = 'Total'
data.loc[(data['Datetime'].dt.month == 2) & (data['Datetime'].dt.day == 27), 'Strike'] = 'Total'

# March
data.loc[(data['Datetime'].dt.month == 3) & (data['Datetime'].dt.day == 10), 'Strike'] = 'MinS'
data.loc[(data['Datetime'].dt.month == 3) & (data['Datetime'].dt.day >= 11) & (data['Datetime'].dt.day <= 17), 'Strike'] = 'Partial'

# April
data.loc[(data['Datetime'].dt.month == 4) & ((data['Datetime'].dt.day >= 1) & (data['Datetime'].dt.day <= 5)), 'Strike'] = 'Partial'
data.loc[(data['Datetime'].dt.month == 4) & ((data['Datetime'].dt.day >= 7) & (data['Datetime'].dt.day <= 30)), 'Strike'] = 'Partial'
data.loc[(data['Datetime'].dt.month == 4) & (data['Datetime'].dt.day == 6), 'Strike'] = 'MinS'

# May
data.loc[(data['Datetime'].dt.month == 5) & (data['Datetime'].dt.day == 30), 'Strike'] = 'Partial'
data.loc[(data['Datetime'].dt.month == 5) & (data['Datetime'].dt.day == 31), 'Strike'] = 'MinS'

# June
data.loc[(data['Datetime'].dt.month == 6) & (data['Datetime'].dt.day >= 5) & (data['Datetime'].dt.day <= 30), 'Strike'] = 'Partial'

# Display unique values in the 'Strike' column to verify the changes
data['Strike'].unique()

### Create Zone Column

In [16]:
# atribuir a cada grid a zona a que diz respeito
dicionario = {
    'Ponte Vasco da Gama': [3651, 3629],
    'A36 (Túnel do Grilo)': [3563, 3564, 3565],
    'IC16': [3319, 3320],
    'N117 (Cabos Ávila)': [933, 934, 990],
    'Marginal': [75, 76],
    'IC2 (Sacavém)': [3736, 3728],
    'A1': [3524, 3564],
    'Calçada de Carriche': [3460, 3500, 3501],
    'IC19': [1758, 1759, 1699],
    'A5': [698, 757, 758],
    'Ponte 25 Abril': [102, 103],
}


data['Zone'] = data['Grid_ID'].map({number: name for name, numbers in dicionario.items() for number in numbers})

### Create TimePeriod Column 

In [17]:
# Create conditions to identify morning and afternoon
condition1 = (data['Time'].astype(str) >= '07:00:00') & (data['Time'].astype(str) <= '9:30:00')

condition2 = (data['Time'].astype(str) >= '17:00:00') & (data['Time'].astype(str) <= '19:30:00')

data['TimePeriod'] = "Other" 

data.loc[condition1, 'TimePeriod'] = "Morning"

data.loc[condition2, 'TimePeriod'] = "Afternoon"

In [18]:
# Confirm changes
data['TimePeriod'].unique()

array(['Other', 'Morning', 'Afternoon'], dtype=object)

### Create Holiday Column
##### Binary column that assigns 'Yes' if the date is a holiday and 'No' otherwise.

In [30]:
# List of holiday dates
holiday_dates = ['2023-01-01', '2023-02-21', '2023-04-07', '2023-04-09', '2023-04-25', '2023-05-01', '2023-06-08', '2023-06-10', '2023-06-13', '2023-06-24', '2023-06-29' ] 

In [33]:
# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

In [34]:
# Create 'Holiday' column
data['Holiday'] = data['Date'].isin(holiday_dates).map({True: 'Yes', False: 'No'})

In [35]:
# Display unique values in the 'Holiday' column to verify the changes
data['Holiday'].unique()

array(['Yes', 'No'], dtype=object)

### Create Weekend Column
##### Binary column that assigns 'Yes' if it's a weekend (Saturday or Sunday) and 'No' otherwise.

In [37]:
# Create 'Weekend' column
data['Weekend'] = data['Date'].dt.day_name().isin(['Saturday', 'Sunday']).map({True: 'Yes', False: 'No'})

### Merge the Dataset with the WKT File

In [38]:
# Load the WKT file
wkt=pd.read_csv('wktComplete.csv', encoding='ISO-8859-1')

In [39]:
# Rename the column to match for merging
wkt.rename(columns={'grelha_id': 'Grid_ID'}, inplace=True)

In [40]:
# Merge the data with WKT on 'Grid_ID'
data=data.merge(wkt, how='inner', on='Grid_ID')

### Export data

In [65]:
# Select columns
data=data[['Grid_ID', 'Datetime', 'C1', 'C2', 'C3', 'C4', 'Time', 'Date', 'Greve',
       'Zone', 'TimePeriod', 'Holiday', 'Weekend','freguesia','latitude', 'longitude', 'nome']]

In [66]:
data

Unnamed: 0,Grid_ID,Datetime,C1,C2,C3,C4,Time,Date,Greve,Zone,TimePeriod,Holiday,Weekend,freguesia,freguesias,latitude,longitude,nome
0,3564,2023-01-01 00:00:00+00:00,17.639999,0.000000,17.639999,0.000000,00:00:00,2023-01-01,No,A1,Other,Yes,Yes,Olivais,Santa Maria dos Olivais,38.783186,-9.122619,Segunda Circular {Relógio - A2}
1,3564,2023-01-01 00:15:00+00:00,60.020000,0.000000,6.280000,0.000000,00:15:00,2023-01-01,No,A1,Other,Yes,Yes,Olivais,Santa Maria dos Olivais,38.783186,-9.122619,Segunda Circular {Relógio - A2}
2,3564,2023-01-01 00:30:00+00:00,52.330002,0.000000,52.330002,0.000000,00:30:00,2023-01-01,No,A1,Other,Yes,Yes,Olivais,Santa Maria dos Olivais,38.783186,-9.122619,Segunda Circular {Relógio - A2}
3,3564,2023-01-01 00:45:00+00:00,11.580000,2.910000,3.420000,0.000000,00:45:00,2023-01-01,No,A1,Other,Yes,Yes,Olivais,Santa Maria dos Olivais,38.783186,-9.122619,Segunda Circular {Relógio - A2}
4,3564,2023-01-01 01:00:00+00:00,0.000000,0.000000,0.000000,0.000000,01:00:00,2023-01-01,No,A1,Other,Yes,Yes,Olivais,Santa Maria dos Olivais,38.783186,-9.122619,Segunda Circular {Relógio - A2}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376676,3563,2023-06-30 23:00:00+00:00,106.949997,45.700001,78.349998,35.650002,23:00:00,2023-06-30,Partial,A36 (Túnel do Grilo),Morning,No,No,Olivais,Santa Maria dos Olivais,38.783186,-9.124416,Figo Maduro
376677,3563,2023-06-30 23:15:00+00:00,87.839996,30.900000,72.989998,27.600000,23:15:00,2023-06-30,Partial,A36 (Túnel do Grilo),Morning,No,No,Olivais,Santa Maria dos Olivais,38.783186,-9.124416,Figo Maduro
376678,3563,2023-06-30 23:15:00+00:00,87.839996,30.900000,72.989998,27.600000,23:15:00,2023-06-30,Partial,A36 (Túnel do Grilo),Morning,No,No,Olivais,Santa Maria dos Olivais,38.783186,-9.124416,Figo Maduro
376679,3563,2023-06-30 23:30:00+00:00,103.019997,32.970001,91.400002,32.970001,23:30:00,2023-06-30,Partial,A36 (Túnel do Grilo),Morning,No,No,Olivais,Santa Maria dos Olivais,38.783186,-9.124416,Figo Maduro


In [67]:
# Export to csv
data.to_csv('data_final_greves.csv')