In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# TASK 1.1 CLEANING THE DATA
# Load the raw flight records, decoding the file as ISO-8859-1.
airline = pd.read_csv('airline_small.csv', encoding='ISO-8859-1', low_memory=False)

# Columns that are excluded from the analysis.
bad_columns = [
    'TailNum',
    'Cancelled',
    'Diverted',
    'TaxiIn',
    'TaxiOut',
    'CancellationCode',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]

# Keep every column that is not listed above.
airline_sub = airline.loc[:, airline.columns.difference(bad_columns)]

# Rows with a missing value in ANY kept column (numeric or not) are dropped
# first; only afterwards is the frame restricted to its numeric columns.
airline_clean = airline_sub.dropna().select_dtypes(include=['number'])

# TASK 1.2 CREATING TRAINING AND TEST SETS
# Binary label: 1 when ArrDelay is greater than 20, otherwise 0.
target = airline_clean['ArrDelay'].gt(20).astype(int)

# Every remaining column except the one the label is derived from.
features = airline_clean.loc[:, airline_clean.columns.difference(['ArrDelay'])]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

# TASK 1.3 STANDARDIZE THE DATA
# Per-column statistics are computed on the training split only and reused
# for the test split, so both are scaled with the same parameters.
x_train_mean = x_train.mean()
x_train_std = x_train.std()

# A column that is constant within the training split has a standard
# deviation of 0. Dividing by it would produce NaN (0/0) in the training data
# and NaN/inf in the test data, which PCA.fit / PCA.transform reject.
# Such columns are scaled by 1 instead, i.e. they are only centred.
x_train_scale = x_train_std.replace(0, 1.0)

x_train_norm = (x_train - x_train_mean) / x_train_scale
x_test_norm = (x_test - x_train_mean) / x_train_scale

# TASK 1.4 PCA
# Fit a PCA with default settings on the standardized training data to
# inspect how the variance is distributed over the components.
pca = PCA()
pca.fit(x_train_norm)

# Explained variance per component in percent, and its running total.
explain_var = pca.explained_variance_ratio_ * 100
cum_var = explain_var.cumsum()

# Flag (1/0) the components whose running total is still at or below 85,
# then take one more component so the running total goes above 85.
good_dimensions = (cum_var <= 85).astype(int)
n_components = good_dimensions.sum() + 1

# TASK 1.5 PCA TO REDUCE THE DIMENSIONALITY OF OUR DATA
# Refit on the training data, this time keeping only the number of
# components selected in the previous step.
pca = PCA(n_components=n_components).fit(x_train_norm)

# Project both splits with the transformation fitted on the training data.
x_train_white, x_test_white = (
    pca.transform(split) for split in (x_train_norm, x_test_norm)
)

# Show the first five projected rows of each split (train first, then test).
for projected in (x_train_white, x_test_white):
    print(projected[:5])