In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR

%matplotlib inline
import matplotlib.pyplot as plt

# Fetch Data

In [None]:
# Remote location of the raw auto-mpg table.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'auto-mpg/auto-mpg.data'
)

# Column labels handed to the CSV reader (applied positionally via `names=`).
names = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'car name',
]

In [None]:
# Load the table: fields are separated by runs of whitespace and '?' marks a
# missing value (parsed as NaN). `sep=r'\s+'` is the documented replacement for
# `delim_whitespace=True`, which pandas deprecated in 2.2.
df = pd.read_csv(url, names=names, sep=r'\s+', na_values='?')

# Missing Values

In [None]:
# Keep only the rows in which at least one cell is null.
missing = df.loc[df.isna().any(axis='columns')]
missing  # Missing values in one independent variable -> fill with median

In [None]:
# dtype names treated as numeric for the purposes of imputation
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Columns whose dtype is in the list above: these go through the numeric imputer
df_numeric = df.select_dtypes(include=numerics)

# Everything else: set aside untouched and re-attached after imputation
df_non_numerics = df.drop(df_numeric.columns, axis=1)

In [None]:
# Median imputer for the numeric columns (NaNs are filled with the column median).
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# on newer versions `sklearn.impute.SimpleImputer(strategy='median')` is the replacement.
imputer = Imputer(strategy='median') # Instantiate the (not yet fitted) imputer
# TODO: an imputer for the non-numeric columns was sketched here but never written.

In [None]:
# Learn the per-column medians that `transform` will later substitute for NaNs.
# NOTE(review): this fit sees every row of `df_numeric` (presumably including
# 'mpg', if it is numeric) before the train/test split further down, so the
# held-out rows influence the medians.
imputer.fit(df_numeric) # Compute the median for every series

In [None]:
# Replace NaNs with the fitted medians. `transform` returns a bare ndarray, so
# both the column labels AND the original row index are re-attached: without
# `index=` the new frame would get a fresh 0..n-1 index, and the index-aligned
# `join` below would pair rows incorrectly whenever `df` is not indexed 0..n-1.
# Note: the imputed columns come back as floats.
df_numeric = pd.DataFrame(
    imputer.transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

# Re-attach the columns that were set aside (joined on the row index).
df = df_numeric.join(df_non_numerics)

# Re-run the null check; an empty frame here means nothing is missing any more.
missing = df[df.isnull().any(axis=1)]
missing # No Missing -> Proceed to Training Split

# Random Sampling

In [None]:
# 'car name' is dropped: no reason from theory to include it as a predictor.
# errors='ignore' keeps this cell re-runnable; without it a second execution
# raises KeyError because the column is already gone from `df`.
df = df.drop('car name', axis=1, errors='ignore')

# Features are every remaining column except the target. The target is kept as
# a one-column DataFrame so it can be joined back onto the features later.
X = df.drop('mpg', axis=1)
y = df[['mpg']]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state (the seed)
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=420,
)

In [None]:
# Recombine the training features and target into a single frame for
# exploration (index-aligned left join, which is DataFrame.join's default).
training_set = X_train.join(y_train, how='left')

# Descriptive Statistics

In [None]:
# Summary statistics of the training split only (the test rows are not included).
training_set.describe()

In [None]:
# Pairwise correlations between the training-set columns
corr_matrix = training_set.corr()

# Rank every column by its correlation with mpg, strongest positive first
corr_matrix.loc[:, 'mpg'].sort_values(ascending=False)

# Visualization

In [None]:
# Take a look at the attributes most correlated with mpg.
# Plot from `training_set`, not `df`: `df` still contains the held-out test
# rows, and the other exploratory cells look at the training split only.
attributes = ['mpg', 'cylinders', 'horsepower', 'weight', 'displacement']
scatter_matrix(training_set[attributes], figsize=(20, 8))
plt.show()

In [None]:
# Columns to put on the horizontal ('X') and vertical ('y') axes
params = {'X': 'model year', 'y': 'horsepower'}

# One marker per training row, coloured by its mpg value (jet colormap)
training_set.plot.scatter(
    x=params['X'],
    y=params['y'],
    figsize=(40, 30),
    alpha=1,
    s=100,
    label="mpg",
    c="mpg",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
# Original observation: clear upward trend of mpg over time at every weight,
# cylinder-count, etc.
plt.legend()

# Normalization / Regression

In [None]:
# Standardise the features, then fit a linear support-vector *regressor*.
# LinearSVR is a regression estimator, so despite the "classifier"/"svc" names
# this pipeline predicts a continuous mpg value. The variable name and the step
# names are left unchanged so that code referring to them keeps working.
mpg_classifier = Pipeline([
        ("scalar", StandardScaler()), # Scale every input feature to zero mean / unit variance (z-score)
        ("linear_svc", LinearSVR(random_state=420)), # Seeded so repeated fits produce the same model
    ])

In [None]:
# Fit the scaler and the regressor on the training split. `y_train` is a
# one-column DataFrame; scikit-learn expects a 1-D target and emits a
# DataConversionWarning for column vectors, so it is flattened first.
mpg_classifier.fit(X_train, y_train.values.ravel())

In [None]:
# Predict mpg for the held-out rows and wrap the resulting array in a
# one-column frame (default 0..n-1 index).
predictions = pd.DataFrame(
    mpg_classifier.predict(X_test),
    columns=['predicted value'],
)

In [None]:
# Put predictions and true test labels side by side. The test labels still
# carry their original row labels, so their index is reset to 0..n-1 to line
# up with `predictions` before joining.
results = predictions.join(
    y_test.reset_index(drop=True),
)
results