### Imports

In [1]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

### Session Start and Data Load

In [2]:
# Path of the CSV file to load (hard-coded here; edit this value to point at another file)
input_data_location = "data/1987.csv"

# Start a Spark session for this notebook (getOrCreate reuses an active one if present)
session_builder = SparkSession.builder.appName("DataLoadingExample")
spark = session_builder.getOrCreate()

# Read the CSV into a PySpark DataFrame: the first row is used as the header
# and column types are inferred from the data
csv_read_options = {"header": True, "inferSchema": True}
df = spark.read.csv(input_data_location, **csv_read_options)

### Data Preprocessing

In [3]:
# Columns that are always dropped (listed by hand)
columns_to_remove = [
    'ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
]

# Additionally drop every remaining column that holds a single distinct value.
# The candidate list is built before the loop, so appending to
# 'columns_to_remove' while iterating does not affect the iteration.
# Note: each check runs its own distinct().count() over the DataFrame.
candidate_cols = [name for name in df.columns if name not in columns_to_remove]
for name in candidate_cols:
    n_distinct = df.select(name).distinct().count()
    if n_distinct != 1:
        continue
    print("Column '{}' has only one unique value".format(name))
    columns_to_remove.append(name)

# Keep only the columns that were not marked for removal
kept_cols = [name for name in df.columns if name not in columns_to_remove]
df = df.select(kept_cols)

# Feature lists: the categorical columns are named explicitly; every other
# remaining column is treated as numerical (by elimination, not by dtype)
target_var = 'ArrDelay'
categorical_cols = ['UniqueCarrier', 'Origin', 'Dest']
numerical_cols = [name for name in df.columns if name not in categorical_cols]

# The target variable must not appear in either feature list
for feature_list in (numerical_cols, categorical_cols):
    if target_var in feature_list:
        feature_list.remove(target_var)

# Show the resulting feature lists
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)


Column 'Year' has only one unique value
Column 'TailNum' has only one unique value
Column 'TaxiOut' has only one unique value
Column 'CancellationCode' has only one unique value
Numerical Columns: ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'CRSArrTime', 'FlightNum', 'CRSElapsedTime', 'DepDelay', 'Distance', 'Cancelled']
Categorical Columns: ['UniqueCarrier', 'Origin', 'Dest']


In [4]:
# Build one StringIndexer stage per categorical column; each stage writes its
# result to a new '<column>_encoded' column
indexers = []
for cat_col in categorical_cols:
    indexers.append(StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_encoded"))

# Fit all indexers on df in a single pipeline, then apply them to the same data
pipeline = Pipeline(stages=indexers)
encoder_model = pipeline.fit(df)

# Keep only the encoded versions: the original categorical columns are dropped
df_encoded = encoder_model.transform(df).drop(*categorical_cols)

# Preview the encoded DataFrame
df_encoded.show()

+-----+----------+---------+-------+----------+----------+---------+--------------+--------+--------+--------+---------+---------------------+--------------+------------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Distance|Cancelled|UniqueCarrier_encoded|Origin_encoded|Dest_encoded|
+-----+----------+---------+-------+----------+----------+---------+--------------+--------+--------+--------+---------+---------------------+--------------+------------+
|   10|        14|        3|    741|       730|       849|     1451|            79|      23|      11|     447|        0|                 11.0|          28.0|         5.0|
|   10|        15|        4|    729|       730|       849|     1451|            79|      14|      -1|     447|        0|                 11.0|          28.0|         5.0|
|   10|        17|        6|    741|       730|       849|     1451|            79|      29|      11|     447|        0|                 11.0|   

### Finish Session

In [5]:
# Stop the Spark session created at the start of the notebook.
# Any cell that uses `spark` after this point needs a new session to be created first.
spark.stop()