This is a Data Science Exploration 

The data set is from the UCI Machine Learning Repository located [here.](https://archive.ics.uci.edu/ml/index.php)


# Load the Data

In [104]:
import pandas as pd
import numpy as np
# The raw CSV has no header row, so column labels are attached after loading.
# Attribute names follow
# https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
# NOTE: a few labels are misspelled relative to that file (e.g. "nomalized-losses");
# later cells index columns by these exact strings, so they are kept verbatim.
headers = [
    "symboling",
    "nomalized-losses",
    "make",
    "fule-type",
    "aspiration",
    "num-of-doors",
    "body-styple",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, header=None)
df.columns = headers

In [105]:
# Preview the first ten rows to sanity-check the load and the column labels.
first_rows = df.head(n=10)
print(first_rows)



   symboling nomalized-losses         make fule-type aspiration num-of-doors  \
0          3                ?  alfa-romero       gas        std          two   
1          3                ?  alfa-romero       gas        std          two   
2          1                ?  alfa-romero       gas        std          two   
3          2              164         audi       gas        std         four   
4          2              164         audi       gas        std         four   
5          2                ?         audi       gas        std          two   
6          1              158         audi       gas        std         four   
7          1                ?         audi       gas        std         four   
8          1              158         audi       gas      turbo         four   
9          0                ?         audi       gas      turbo          two   

   body-styple drive-wheels engine-location  wheel-base  ...    engine-size  \
0  convertible          rwd           fr

From https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
 Number of Attributes: 26 total
   -- 15 continuous
   -- 1 integer
   -- 10 nominal
   
  1. symboling:                -3, -2, -1, 0, 1, 2, 3.
  2. normalized-losses:        continuous from 65 to 256.
  3. make:                     alfa-romero, audi, bmw, chevrolet, dodge, honda,
                               isuzu, jaguar, mazda, mercedes-benz, mercury,
                               mitsubishi, nissan, peugot, plymouth, porsche,
                               renault, saab, subaru, toyota, volkswagen, volvo
  4. fuel-type:                diesel, gas.
  5. aspiration:               std, turbo.
  6. num-of-doors:             four, two.
  7. body-style:               hardtop, wagon, sedan, hatchback, convertible.
  8. drive-wheels:             4wd, fwd, rwd.
  9. engine-location:          front, rear.
 10. wheel-base:               continuous from 86.6 to 120.9.
 11. length:                   continuous from 141.1 to 208.1.
 12. width:                    continuous from 60.3 to 72.3.
 13. height:                   continuous from 47.8 to 59.8.
 14. curb-weight:              continuous from 1488 to 4066.
 15. engine-type:              dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
 16. num-of-cylinders:         eight, five, four, six, three, twelve, two.
 17. engine-size:              continuous from 61 to 326.
 18. fuel-system:              1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
 19. bore:                     continuous from 2.54 to 3.94.
 20. stroke:                   continuous from 2.07 to 4.17.
 21. compression-ratio:        continuous from 7 to 23.
 22. horsepower:               continuous from 48 to 288.
 23. peak-rpm:                 continuous from 4150 to 6600.
 24. city-mpg:                 continuous from 13 to 49.
 25. highway-mpg:              continuous from 16 to 54.
 26. price:                    continuous from 5118 to 45400.


In [106]:
# Show the dtype pandas inferred for every column.
inferred_dtypes = df.dtypes
print(inferred_dtypes)


symboling              int64
nomalized-losses      object
make                  object
fule-type             object
aspiration            object
num-of-doors          object
body-styple           object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object


The default pandas reader loaded some data types incorrectly: missing values are encoded as `?`, so several numeric columns were read as `object`. We should have
   -- 15 continuous
   -- 1 integer
   -- 10 nominal
Let's convert those columns to numeric.

In [107]:
# Fix data types.
# Missing values are encoded as "?" in the raw file, which is why these numeric
# columns were read as `object`. For each of them:
#   1. coerce to numeric, which turns "?" (and anything else unparseable) into NaN
#   2. compute the column mean (NaN values are skipped)
#   3. fill the NaNs with that mean
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place call works on an
# intermediate Series and does not update `df` under pandas Copy-on-Write.
MEAN_IMPUTED_COLUMNS = ["nomalized-losses", "bore", "stroke", "horsepower", "peak-rpm"]

for column in MEAN_IMPUTED_COLUMNS:
    numeric_values = pd.to_numeric(df[column], errors="coerce")
    mean = numeric_values.mean()
    df[column] = numeric_values.fillna(mean)

# Categorical column: no dtype conversion needed, but "?" still marks a missing
# value. It is filled with "four" (presumably the most common value -- TODO confirm).
df["num-of-doors"] = df["num-of-doors"].replace("?", "four")

# Special case: the target.
# Coerce to numeric so "?" becomes NaN, then drop those rows -- a row without a
# price cannot be used. This in-place call is made on `df` itself (not on a
# column selection), so it does update the frame.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df.dropna(subset=["price"], axis=0, inplace=True)

# Verify that every converted column is now numeric.
print(df.dtypes)

symboling              int64
nomalized-losses     float64
make                  object
fule-type             object
aspiration            object
num-of-doors          object
body-styple           object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object


Now that the data types are verified, let's look at the data.
Here we look for any implausible maximum, minimum, or standard deviation.

In [108]:
# Report the frame's dimensions on either side of dropping rows whose price is
# missing (price is required, so such rows cannot be kept).
shape_before = df.shape
df.dropna(subset=["price"], inplace=True)
shape_after = df.shape
print(shape_before)
print(shape_after)

(201, 26)
(201, 26)


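Both shapes are identical, so this cell removed no rows: the entries with a missing price were already dropped in the previous cell. The raw file has 205 rows, so 4 entries were removed in total, leaving 201.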

In [109]:
# Summary statistics for every column, numeric and categorical alike.
summary = df.describe(include="all")
print(summary)

         symboling  nomalized-losses    make fule-type aspiration  \
count   201.000000         201.00000     201       201        201   
unique         NaN               NaN      22         2          2   
top            NaN               NaN  toyota       gas        std   
freq           NaN               NaN      32       181        165   
mean      0.840796         122.00000     NaN       NaN        NaN   
std       1.254802          31.99625     NaN       NaN        NaN   
min      -2.000000          65.00000     NaN       NaN        NaN   
25%       0.000000         101.00000     NaN       NaN        NaN   
50%       1.000000         122.00000     NaN       NaN        NaN   
75%       2.000000         137.00000     NaN       NaN        NaN   
max       3.000000         256.00000     NaN       NaN        NaN   

       num-of-doors body-styple drive-wheels engine-location  wheel-base  \
count           201         201          201             201  201.000000   
unique            2